{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100.0, "global_step": 493, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/mean_length": 815.7708740234375, "completions/min_length": 702.0, "entropy/max": 0.2529296875, "entropy/mean": 0.18994140625, "entropy/min": 0.14794921875, "epoch": 0.002028397565922921, "frac_reward_zero_std": 0.0, "grad_norm": 0.11101870983839035, "learning_rate": 2e-07, "loss": -0.010120062157511711, "reward": 1.8341436386108398, "reward_std": 0.23365744948387146, "rewards/DiagnosisAccuracyORM/mean": 0.46145835518836975, "rewards/DiagnosisAccuracyORM/std": 0.29587703943252563, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.37268519401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.1417359858751297, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/mean_length": 817.625, "completions/min_length": 672.0, "entropy/max": 0.353515625, "entropy/mean": 0.2109375, "entropy/min": 0.12939453125, "epoch": 0.004056795131845842, "frac_reward_zero_std": 0.0, "grad_norm": 0.10487744957208633, "learning_rate": 4e-07, "loss": -0.004713843576610088, "reward": 1.7354663610458374, "reward_std": 0.23017507791519165, "rewards/DiagnosisAccuracyORM/mean": 0.44900795817375183, "rewards/DiagnosisAccuracyORM/std": 0.23968055844306946, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2864583432674408, "rewards/KeyDiagnosticEvidenceORM/std": 0.13622218370437622, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/mean_length": 820.8958740234375, "completions/min_length": 691.0, "entropy/max": 0.3740234375, "entropy/mean": 0.22265625, "entropy/min": 0.15234375, "epoch": 0.006085192697768763, "frac_reward_zero_std": 0.0, "grad_norm": 0.14589230716228485, "learning_rate": 6e-07, "loss": -0.013054202310740948, "reward": 1.7054314613342285, "reward_std": 0.23437750339508057, "rewards/DiagnosisAccuracyORM/mean": 0.40219080448150635, "rewards/DiagnosisAccuracyORM/std": 0.2545088231563568, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30324074625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.148523211479187, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/mean_length": 812.9583740234375, "completions/min_length": 677.0, "entropy/max": 0.314453125, "entropy/mean": 0.21142578125, "entropy/min": 0.125, "epoch": 0.008113590263691683, "frac_reward_zero_std": 0.0, "grad_norm": 0.12062899023294449, "learning_rate": 8e-07, "loss": 0.0022564493119716644, "reward": 1.8589617013931274, "reward_std": 0.2550734579563141, "rewards/DiagnosisAccuracyORM/mean": 0.5319940447807312, "rewards/DiagnosisAccuracyORM/std": 0.28946661949157715, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32696762681007385, "rewards/KeyDiagnosticEvidenceORM/std": 0.15653185546398163, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/mean_length": 808.3958740234375, "completions/min_length": 573.0, "entropy/max": 0.3056640625, "entropy/mean": 0.21142578125, "entropy/min": 0.13623046875, "epoch": 0.010141987829614604, "frac_reward_zero_std": 0.0, "grad_norm": 0.12531079351902008, "learning_rate": 1e-06, "loss": 0.005617068614810705, "reward": 1.6768519878387451, "reward_std": 0.24728178977966309, "rewards/DiagnosisAccuracyORM/mean": 0.42222222685813904, "rewards/DiagnosisAccuracyORM/std": 0.3076513111591339, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25462964177131653, "rewards/KeyDiagnosticEvidenceORM/std": 0.09930367767810822, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/mean_length": 807.3958740234375, "completions/min_length": 688.0, "entropy/max": 0.4365234375, "entropy/mean": 0.234375, "entropy/min": 0.14990234375, "epoch": 0.012170385395537525, "frac_reward_zero_std": 0.0, "grad_norm": 0.14344868063926697, "learning_rate": 9.99989639073087e-07, "loss": 0.009232744574546814, "reward": 1.6659722328186035, "reward_std": 0.2080886960029602, "rewards/DiagnosisAccuracyORM/mean": 0.4055555760860443, "rewards/DiagnosisAccuracyORM/std": 0.2360864132642746, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2604166567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.10896644741296768, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/mean_length": 806.0416870117188, "completions/min_length": 673.0, "entropy/max": 0.283203125, "entropy/mean": 0.19921875, "entropy/min": 0.13623046875, "epoch": 0.014198782961460446, "frac_reward_zero_std": 0.0, "grad_norm": 0.10785648971796036, "learning_rate": 9.999585567217438e-07, "loss": -0.005804389715194702, "reward": 1.8398313522338867, "reward_std": 0.16480009257793427, "rewards/DiagnosisAccuracyORM/mean": 0.5446924567222595, "rewards/DiagnosisAccuracyORM/std": 0.2602759897708893, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2951388955116272, "rewards/KeyDiagnosticEvidenceORM/std": 0.08491892367601395, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/mean_length": 829.3333740234375, "completions/min_length": 686.0, "entropy/max": 0.4111328125, "entropy/mean": 0.228515625, "entropy/min": 0.14111328125, "epoch": 0.016227180527383367, "frac_reward_zero_std": 0.0, "grad_norm": 0.12130726873874664, "learning_rate": 9.999067542341378e-07, "loss": 0.021407222375273705, "reward": 1.8340774774551392, "reward_std": 0.29184281826019287, "rewards/DiagnosisAccuracyORM/mean": 0.5424107313156128, "rewards/DiagnosisAccuracyORM/std": 0.33900752663612366, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2916666567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.1450749933719635, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/mean_length": 810.4166870117188, "completions/min_length": 590.0, "entropy/max": 0.294921875, "entropy/mean": 0.20166015625, "entropy/min": 0.13818359375, "epoch": 0.018255578093306288, "frac_reward_zero_std": 0.0, "grad_norm": 0.13188520073890686, "learning_rate": 9.998342337571565e-07, "loss": -0.005243720021098852, "reward": 1.6496444940567017, "reward_std": 0.21206626296043396, "rewards/DiagnosisAccuracyORM/mean": 0.3750495910644531, "rewards/DiagnosisAccuracyORM/std": 0.2647578716278076, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27459490299224854, "rewards/KeyDiagnosticEvidenceORM/std": 0.08915074914693832, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1092.0, "completions/mean_length": 822.4791870117188, "completions/min_length": 674.0, "entropy/max": 0.41015625, "entropy/mean": 0.228515625, "entropy/min": 0.15673828125, "epoch": 0.02028397565922921, "frac_reward_zero_std": 0.0, "grad_norm": 0.12816795706748962, "learning_rate": 9.997409982963171e-07, "loss": 0.011149754747748375, "reward": 1.6415510177612305, "reward_std": 0.21493875980377197, "rewards/DiagnosisAccuracyORM/mean": 0.39560186862945557, "rewards/DiagnosisAccuracyORM/std": 0.31132781505584717, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24594907462596893, "rewards/KeyDiagnosticEvidenceORM/std": 0.1158829927444458, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/mean_length": 821.5208740234375, "completions/min_length": 669.0, "entropy/max": 0.4814453125, "entropy/mean": 0.23193359375, "entropy/min": 0.1572265625, "epoch": 0.02231237322515213, "frac_reward_zero_std": 0.0, "grad_norm": 0.11357872933149338, "learning_rate": 9.99627051715643e-07, "loss": 0.00038862600922584534, "reward": 1.6372685432434082, "reward_std": 0.18441276252269745, "rewards/DiagnosisAccuracyORM/mean": 0.3517361581325531, "rewards/DiagnosisAccuracyORM/std": 0.2933107614517212, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2855324149131775, "rewards/KeyDiagnosticEvidenceORM/std": 0.11365146189928055, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/mean_length": 819.7083740234375, "completions/min_length": 672.0, "entropy/max": 0.298828125, "entropy/mean": 0.21923828125, "entropy/min": 0.14794921875, "epoch": 0.02434077079107505, "frac_reward_zero_std": 0.0, "grad_norm": 0.1091248095035553, "learning_rate": 9.994923987375028e-07, "loss": 0.0018920451402664185, "reward": 1.7971065044403076, "reward_std": 0.2986885607242584, "rewards/DiagnosisAccuracyORM/mean": 0.46840277314186096, "rewards/DiagnosisAccuracyORM/std": 0.3071688711643219, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32870373129844666, "rewards/KeyDiagnosticEvidenceORM/std": 0.10602008551359177, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/mean_length": 810.4375, "completions/min_length": 598.0, "entropy/max": 0.42578125, "entropy/mean": 0.23974609375, "entropy/min": 0.14208984375, "epoch": 0.02636916835699797, "frac_reward_zero_std": 0.0, "grad_norm": 0.12784306704998016, "learning_rate": 9.993370449424152e-07, "loss": -0.013183921575546265, "reward": 1.7361111640930176, "reward_std": 0.21626242995262146, "rewards/DiagnosisAccuracyORM/mean": 0.425347238779068, "rewards/DiagnosisAccuracyORM/std": 0.3348191976547241, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3107638955116272, "rewards/KeyDiagnosticEvidenceORM/std": 0.16091904044151306, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/mean_length": 829.625, "completions/min_length": 700.0, "entropy/max": 0.337890625, "entropy/mean": 0.2255859375, "entropy/min": 0.1376953125, "epoch": 0.028397565922920892, "frac_reward_zero_std": 0.0, "grad_norm": 0.12385847419500351, "learning_rate": 9.991609967688176e-07, "loss": 0.009098611772060394, "reward": 1.639831304550171, "reward_std": 0.24413131177425385, "rewards/DiagnosisAccuracyORM/mean": 0.3516368865966797, "rewards/DiagnosisAccuracyORM/std": 0.3264697790145874, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2881944477558136, "rewards/KeyDiagnosticEvidenceORM/std": 0.16702336072921753, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/mean_length": 819.125, "completions/min_length": 664.0, "entropy/max": 0.4208984375, "entropy/mean": 0.23876953125, "entropy/min": 0.14453125, "epoch": 0.030425963488843813, "frac_reward_zero_std": 0.0, "grad_norm": 0.15057942271232605, "learning_rate": 9.989642615127988e-07, "loss": 0.0011490783654153347, "reward": 1.6871860027313232, "reward_std": 0.22904101014137268, "rewards/DiagnosisAccuracyORM/mean": 0.38452377915382385, "rewards/DiagnosisAccuracyORM/std": 0.24372275173664093, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30266204476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.15463247895240784, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1069.0, "completions/mean_length": 829.5, "completions/min_length": 692.0, "entropy/max": 0.3564453125, "entropy/mean": 0.21142578125, "entropy/min": 0.13916015625, "epoch": 0.032454361054766734, "frac_reward_zero_std": 0.0, "grad_norm": 0.12490449845790863, "learning_rate": 9.987468473277974e-07, "loss": -0.023609867319464684, "reward": 1.7837053537368774, "reward_std": 0.24346211552619934, "rewards/DiagnosisAccuracyORM/mean": 0.44516369700431824, "rewards/DiagnosisAccuracyORM/std": 0.3163755536079407, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3385416567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.12028557062149048, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/mean_length": 823.0833740234375, "completions/min_length": 697.0, "entropy/max": 0.267578125, "entropy/mean": 0.19091796875, "entropy/min": 0.12451171875, "epoch": 0.034482758620689655, "frac_reward_zero_std": 0.0, "grad_norm": 0.11966145038604736, "learning_rate": 9.985087632242632e-07, "loss": 0.006627271883189678, "reward": 1.810929298400879, "reward_std": 0.19529034197330475, "rewards/DiagnosisAccuracyORM/mean": 0.5146329402923584, "rewards/DiagnosisAccuracyORM/std": 0.26819562911987305, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2962963283061981, "rewards/KeyDiagnosticEvidenceORM/std": 0.14623968303203583, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/mean_length": 811.3125, "completions/min_length": 654.0, "entropy/max": 0.3271484375, "entropy/mean": 0.208984375, "entropy/min": 0.138671875, "epoch": 0.036511156186612576, "frac_reward_zero_std": 0.0, "grad_norm": 0.14608757197856903, "learning_rate": 9.982500190692844e-07, "loss": -0.00030716758919879794, "reward": 1.7392526865005493, "reward_std": 0.18237197399139404, "rewards/DiagnosisAccuracyORM/mean": 0.44353508949279785, "rewards/DiagnosisAccuracyORM/std": 0.29135367274284363, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29571759700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.1481531411409378, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/mean_length": 813.6458740234375, "completions/min_length": 613.0, "entropy/max": 0.39453125, "entropy/mean": 0.2509765625, "entropy/min": 0.16015625, "epoch": 0.038539553752535496, "frac_reward_zero_std": 0.0, "grad_norm": 0.18046380579471588, "learning_rate": 9.97970625586178e-07, "loss": 0.012956123799085617, "reward": 1.7005207538604736, "reward_std": 0.31487053632736206, "rewards/DiagnosisAccuracyORM/mean": 0.3845486342906952, "rewards/DiagnosisAccuracyORM/std": 0.29987287521362305, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.315972238779068, "rewards/KeyDiagnosticEvidenceORM/std": 0.14993049204349518, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/mean_length": 822.2083740234375, "completions/min_length": 653.0, "entropy/max": 0.294921875, "entropy/mean": 0.21435546875, "entropy/min": 0.131103515625, "epoch": 0.04056795131845842, "frac_reward_zero_std": 0.0, "grad_norm": 0.11420976370573044, "learning_rate": 9.976705943540458e-07, "loss": -0.000745462893974036, "reward": 1.7662203311920166, "reward_std": 0.2398633360862732, "rewards/DiagnosisAccuracyORM/mean": 0.42073413729667664, "rewards/DiagnosisAccuracyORM/std": 0.27004480361938477, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3454860746860504, "rewards/KeyDiagnosticEvidenceORM/std": 0.19906748831272125, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/mean_length": 812.4791870117188, "completions/min_length": 678.0, "entropy/max": 0.3837890625, "entropy/mean": 0.22265625, "entropy/min": 0.1533203125, "epoch": 0.04259634888438134, "frac_reward_zero_std": 0.0, "grad_norm": 0.14904730021953583, "learning_rate": 9.973499378072946e-07, "loss": 0.005483110900968313, "reward": 1.621759295463562, "reward_std": 0.21575690805912018, "rewards/DiagnosisAccuracyORM/mean": 0.3659721910953522, "rewards/DiagnosisAccuracyORM/std": 0.25695890188217163, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25578704476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.1282520592212677, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1079.0, "completions/mean_length": 841.4583740234375, "completions/min_length": 712.0, "entropy/max": 0.4189453125, "entropy/mean": 0.23779296875, "entropy/min": 0.15966796875, "epoch": 0.04462474645030426, "frac_reward_zero_std": 0.0, "grad_norm": 0.1351945996284485, "learning_rate": 9.970086692351202e-07, "loss": -0.004326686263084412, "reward": 1.6689815521240234, "reward_std": 0.20868469774723053, "rewards/DiagnosisAccuracyORM/mean": 0.4201388657093048, "rewards/DiagnosisAccuracyORM/std": 0.32910364866256714, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24884259700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.10976703464984894, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/mean_length": 823.7708740234375, "completions/min_length": 671.0, "entropy/max": 0.298828125, "entropy/mean": 0.2119140625, "entropy/min": 0.14501953125, "epoch": 0.04665314401622718, "frac_reward_zero_std": 0.0, "grad_norm": 0.1293584257364273, "learning_rate": 9.96646802780958e-07, "loss": -0.009899398311972618, "reward": 1.73587965965271, "reward_std": 0.18836289644241333, "rewards/DiagnosisAccuracyORM/mean": 0.44305554032325745, "rewards/DiagnosisAccuracyORM/std": 0.2097739726305008, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29282405972480774, "rewards/KeyDiagnosticEvidenceORM/std": 0.12262430042028427, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/mean_length": 804.1041870117188, "completions/min_length": 615.0, "entropy/max": 0.4267578125, "entropy/mean": 0.2265625, "entropy/min": 0.138671875, "epoch": 0.0486815415821501, "frac_reward_zero_std": 0.0, "grad_norm": 0.14609095454216003, "learning_rate": 9.962643534418953e-07, "loss": -0.0010986527195200324, "reward": 1.6261491775512695, "reward_std": 0.20542484521865845, "rewards/DiagnosisAccuracyORM/mean": 0.3582093417644501, "rewards/DiagnosisAccuracyORM/std": 0.21603897213935852, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.26793983578681946, "rewards/KeyDiagnosticEvidenceORM/std": 0.13994745910167694, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1058.0, "completions/mean_length": 811.125, "completions/min_length": 658.0, "entropy/max": 0.302734375, "entropy/mean": 0.22216796875, "entropy/min": 0.150390625, "epoch": 0.05070993914807302, "frac_reward_zero_std": 0.0, "grad_norm": 0.12023015320301056, "learning_rate": 9.958613370680507e-07, "loss": 0.01910499297082424, "reward": 1.817857265472412, "reward_std": 0.2996102273464203, "rewards/DiagnosisAccuracyORM/mean": 0.5036210417747498, "rewards/DiagnosisAccuracyORM/std": 0.32983726263046265, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3142361342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.11884384602308273, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/mean_length": 787.1875, "completions/min_length": 592.0, "entropy/max": 0.3232421875, "entropy/mean": 0.228515625, "entropy/min": 0.15380859375, "epoch": 0.05273833671399594, "frac_reward_zero_std": 0.0, "grad_norm": 0.12466000765562057, "learning_rate": 9.95437770361917e-07, "loss": -0.0032345852814614773, "reward": 1.7862434387207031, "reward_std": 0.14962677657604218, "rewards/DiagnosisAccuracyORM/mean": 0.5148313641548157, "rewards/DiagnosisAccuracyORM/std": 0.2647479772567749, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27141204476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.1280318647623062, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/mean_length": 806.7083740234375, "completions/min_length": 642.0, "entropy/max": 0.267578125, "entropy/mean": 0.17919921875, "entropy/min": 0.127197265625, "epoch": 0.05476673427991886, "frac_reward_zero_std": 0.0, "grad_norm": 0.13575249910354614, "learning_rate": 9.94993670877669e-07, "loss": -0.011778771877288818, "reward": 1.5785549879074097, "reward_std": 0.14991815388202667, "rewards/DiagnosisAccuracyORM/mean": 0.31755951046943665, "rewards/DiagnosisAccuracyORM/std": 0.21599210798740387, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.26099535822868347, "rewards/KeyDiagnosticEvidenceORM/std": 0.13568885624408722, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/mean_length": 816.6041870117188, "completions/min_length": 632.0, "entropy/max": 0.4794921875, "entropy/mean": 0.23974609375, "entropy/min": 0.14404296875, "epoch": 0.056795131845841784, "frac_reward_zero_std": 0.0, "grad_norm": 0.12944690883159637, "learning_rate": 9.94529057020436e-07, "loss": 0.0067995889112353325, "reward": 1.778753399848938, "reward_std": 0.34154585003852844, "rewards/DiagnosisAccuracyORM/mean": 0.4903273582458496, "rewards/DiagnosisAccuracyORM/std": 0.32939285039901733, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28842592239379883, "rewards/KeyDiagnosticEvidenceORM/std": 0.13682310283184052, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/mean_length": 817.0625, "completions/min_length": 643.0, "entropy/max": 0.287109375, "entropy/mean": 0.2041015625, "entropy/min": 0.14794921875, "epoch": 0.058823529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 0.11445114761590958, "learning_rate": 9.940439480455385e-07, "loss": -0.018469927832484245, "reward": 1.7417658567428589, "reward_std": 0.2571311593055725, "rewards/DiagnosisAccuracyORM/mean": 0.4205853044986725, "rewards/DiagnosisAccuracyORM/std": 0.2752155363559723, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3211805522441864, "rewards/KeyDiagnosticEvidenceORM/std": 0.12901118397712708, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/mean_length": 798.8541870117188, "completions/min_length": 629.0, "entropy/max": 0.31640625, "entropy/mean": 0.2265625, "entropy/min": 0.15625, "epoch": 0.060851926977687626, "frac_reward_zero_std": 0.0, "grad_norm": 0.12065097689628601, "learning_rate": 9.935383640576913e-07, "loss": -0.006367970257997513, "reward": 1.8542823791503906, "reward_std": 0.22766388952732086, "rewards/DiagnosisAccuracyORM/mean": 0.5788194537162781, "rewards/DiagnosisAccuracyORM/std": 0.3115343451499939, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2754629850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.11500753462314606, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/mean_length": 813.9583740234375, "completions/min_length": 664.0, "entropy/max": 0.421875, "entropy/mean": 0.23046875, "entropy/min": 0.13818359375, "epoch": 0.06288032454361055, "frac_reward_zero_std": 0.0, "grad_norm": 0.14535082876682281, "learning_rate": 9.930123260101696e-07, "loss": -0.0057922303676605225, "reward": 1.7729166746139526, "reward_std": 0.2898646295070648, "rewards/DiagnosisAccuracyORM/mean": 0.5136574506759644, "rewards/DiagnosisAccuracyORM/std": 0.315886914730072, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25925925374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.11831456422805786, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/mean_length": 808.75, "completions/min_length": 569.0, "entropy/max": 0.4345703125, "entropy/mean": 0.234375, "entropy/min": 0.14208984375, "epoch": 0.06490872210953347, "frac_reward_zero_std": 0.0, "grad_norm": 0.12251468002796173, "learning_rate": 9.924658557039398e-07, "loss": -0.024919670075178146, "reward": 1.6328538656234741, "reward_std": 0.23279626667499542, "rewards/DiagnosisAccuracyORM/mean": 0.324404776096344, "rewards/DiagnosisAccuracyORM/std": 0.22457751631736755, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30844905972480774, "rewards/KeyDiagnosticEvidenceORM/std": 0.12994103133678436, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/mean_length": 815.875, "completions/min_length": 699.0, "entropy/max": 0.3466796875, "entropy/mean": 0.232421875, "entropy/min": 0.158203125, "epoch": 0.06693711967545639, "frac_reward_zero_std": 0.0, "grad_norm": 0.11852788180112839, "learning_rate": 9.918989757867583e-07, "loss": -0.007220124360173941, "reward": 1.5610780715942383, "reward_std": 0.25571465492248535, "rewards/DiagnosisAccuracyORM/mean": 0.32033732533454895, "rewards/DiagnosisAccuracyORM/std": 0.2347654551267624, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24074073135852814, "rewards/KeyDiagnosticEvidenceORM/std": 0.09066150337457657, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/mean_length": 818.5833740234375, "completions/min_length": 666.0, "entropy/max": 0.314453125, "entropy/mean": 0.197265625, "entropy/min": 0.1435546875, "epoch": 0.06896551724137931, "frac_reward_zero_std": 0.0, "grad_norm": 0.1202191710472107, "learning_rate": 9.913117097522298e-07, "loss": 0.0015608991961926222, "reward": 1.6875827312469482, "reward_std": 0.16110453009605408, "rewards/DiagnosisAccuracyORM/mean": 0.3883928954601288, "rewards/DiagnosisAccuracyORM/std": 0.28155219554901123, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29918980598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.08656833320856094, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/mean_length": 824.6041870117188, "completions/min_length": 668.0, "entropy/max": 0.3935546875, "entropy/mean": 0.23681640625, "entropy/min": 0.1357421875, "epoch": 0.07099391480730223, "frac_reward_zero_std": 0.0, "grad_norm": 0.13335521519184113, "learning_rate": 9.907040819388371e-07, "loss": 0.019520573318004608, "reward": 1.7518932819366455, "reward_std": 0.328674852848053, "rewards/DiagnosisAccuracyORM/mean": 0.455596923828125, "rewards/DiagnosisAccuracyORM/std": 0.3126849830150604, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2962963283061981, "rewards/KeyDiagnosticEvidenceORM/std": 0.14934992790222168, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/mean_length": 819.2708740234375, "completions/min_length": 705.0, "entropy/max": 0.28125, "entropy/mean": 0.208984375, "entropy/min": 0.12890625, "epoch": 0.07302231237322515, "frac_reward_zero_std": 0.0, "grad_norm": 0.14652462303638458, "learning_rate": 9.900761175289288e-07, "loss": 0.008400156162679195, "reward": 1.7985780239105225, "reward_std": 0.2164909392595291, "rewards/DiagnosisAccuracyORM/mean": 0.4415178596973419, "rewards/DiagnosisAccuracyORM/std": 0.249309703707695, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.35706019401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.08689957112073898, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/mean_length": 825.1041870117188, "completions/min_length": 636.0, "entropy/max": 0.287109375, "entropy/mean": 0.21875, "entropy/min": 0.15185546875, "epoch": 0.07505070993914807, "frac_reward_zero_std": 0.0, "grad_norm": 0.12908202409744263, "learning_rate": 9.894278425476788e-07, "loss": -0.004232665058225393, "reward": 1.7921957969665527, "reward_std": 0.22640350461006165, "rewards/DiagnosisAccuracyORM/mean": 0.4947420656681061, "rewards/DiagnosisAccuracyORM/std": 0.26500922441482544, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29745370149612427, "rewards/KeyDiagnosticEvidenceORM/std": 0.15353704988956451, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/mean_length": 817.5833740234375, "completions/min_length": 588.0, "entropy/max": 0.3154296875, "entropy/mean": 0.2080078125, "entropy/min": 0.14697265625, "epoch": 0.07707910750507099, "frac_reward_zero_std": 0.0, "grad_norm": 0.1301000863313675, "learning_rate": 9.88759283862006e-07, "loss": -0.00966302677989006, "reward": 1.8179564476013184, "reward_std": 0.2630407214164734, "rewards/DiagnosisAccuracyORM/mean": 0.4648313522338867, "rewards/DiagnosisAccuracyORM/std": 0.31491342186927795, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3531250059604645, "rewards/KeyDiagnosticEvidenceORM/std": 0.13564246892929077, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/mean_length": 834.9791870117188, "completions/min_length": 664.0, "entropy/max": 0.400390625, "entropy/mean": 0.2294921875, "entropy/min": 0.14990234375, "epoch": 0.07910750507099391, "frac_reward_zero_std": 0.0, "grad_norm": 0.11817663908004761, "learning_rate": 9.880704691794607e-07, "loss": 0.0231487900018692, "reward": 1.8439816236495972, "reward_std": 0.27642393112182617, "rewards/DiagnosisAccuracyORM/mean": 0.5934028029441833, "rewards/DiagnosisAccuracyORM/std": 0.31079360842704773, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25057873129844666, "rewards/KeyDiagnosticEvidenceORM/std": 0.13517366349697113, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/mean_length": 836.5416870117188, "completions/min_length": 707.0, "entropy/max": 0.3388671875, "entropy/mean": 0.22216796875, "entropy/min": 0.1376953125, "epoch": 0.08113590263691683, "frac_reward_zero_std": 0.0, "grad_norm": 0.12760798633098602, "learning_rate": 9.873614270470777e-07, "loss": -0.008626642636954784, "reward": 1.7779431343078613, "reward_std": 0.1940496563911438, "rewards/DiagnosisAccuracyORM/mean": 0.5017856955528259, "rewards/DiagnosisAccuracyORM/std": 0.30275288224220276, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.276157408952713, "rewards/KeyDiagnosticEvidenceORM/std": 0.13444648683071136, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/mean_length": 832.125, "completions/min_length": 668.0, "entropy/max": 0.455078125, "entropy/mean": 0.24365234375, "entropy/min": 0.13623046875, "epoch": 0.08316430020283976, "frac_reward_zero_std": 0.0, "grad_norm": 0.13890305161476135, "learning_rate": 9.866321868501912e-07, "loss": -0.0040893033146858215, "reward": 1.6381779909133911, "reward_std": 0.2168230414390564, "rewards/DiagnosisAccuracyORM/mean": 0.33956679701805115, "rewards/DiagnosisAccuracyORM/std": 0.31737929582595825, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2986111342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.11029547452926636, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/mean_length": 826.3125, "completions/min_length": 679.0, "entropy/max": 0.40625, "entropy/mean": 0.22705078125, "entropy/min": 0.14111328125, "epoch": 0.08519269776876268, "frac_reward_zero_std": 0.0, "grad_norm": 0.14698617160320282, "learning_rate": 9.858827788112195e-07, "loss": -0.0020229320507496595, "reward": 1.853108525276184, "reward_std": 0.2876221537590027, "rewards/DiagnosisAccuracyORM/mean": 0.5348214507102966, "rewards/DiagnosisAccuracyORM/std": 0.32934749126434326, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31828704476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.11991678178310394, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/mean_length": 825.4166870117188, "completions/min_length": 622.0, "entropy/max": 0.30859375, "entropy/mean": 0.21337890625, "entropy/min": 0.1494140625, "epoch": 0.0872210953346856, "frac_reward_zero_std": 0.0, "grad_norm": 0.1591872125864029, "learning_rate": 9.851132339884095e-07, "loss": 0.002341667888686061, "reward": 1.756638526916504, "reward_std": 0.20825034379959106, "rewards/DiagnosisAccuracyORM/mean": 0.48464784026145935, "rewards/DiagnosisAccuracyORM/std": 0.2729816734790802, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27199074625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.11036366969347, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/mean_length": 829.75, "completions/min_length": 727.0, "entropy/max": 0.26953125, "entropy/mean": 0.21337890625, "entropy/min": 0.154296875, "epoch": 0.08924949290060852, "frac_reward_zero_std": 0.0, "grad_norm": 0.1137363463640213, "learning_rate": 9.843235842745524e-07, "loss": -0.010735100135207176, "reward": 1.6980819702148438, "reward_std": 0.20218932628631592, "rewards/DiagnosisAccuracyORM/mean": 0.4122023582458496, "rewards/DiagnosisAccuracyORM/std": 0.2585403025150299, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28587961196899414, "rewards/KeyDiagnosticEvidenceORM/std": 0.1288905143737793, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/mean_length": 803.375, "completions/min_length": 693.0, "entropy/max": 0.30810546875, "entropy/mean": 0.1982421875, "entropy/min": 0.1298828125, "epoch": 0.09127789046653144, "frac_reward_zero_std": 0.0, "grad_norm": 0.12807482481002808, "learning_rate": 9.835138623956602e-07, "loss": -0.0022240481339395046, "reward": 1.6160879135131836, "reward_std": 0.18076574802398682, "rewards/DiagnosisAccuracyORM/mean": 0.3527778089046478, "rewards/DiagnosisAccuracyORM/std": 0.27740636467933655, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.26331016421318054, "rewards/KeyDiagnosticEvidenceORM/std": 0.10250180214643478, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/mean_length": 823.4375, "completions/min_length": 667.0, "entropy/max": 0.275390625, "entropy/mean": 0.22705078125, "entropy/min": 0.16748046875, "epoch": 0.09330628803245436, "frac_reward_zero_std": 0.0, "grad_norm": 0.1352786272764206, "learning_rate": 9.826841019096094e-07, "loss": -0.009673903696238995, "reward": 1.7921462059020996, "reward_std": 0.16511498391628265, "rewards/DiagnosisAccuracyORM/mean": 0.5120535492897034, "rewards/DiagnosisAccuracyORM/std": 0.2375040203332901, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28009259700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.1056322529911995, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/mean_length": 795.4166870117188, "completions/min_length": 611.0, "entropy/max": 0.2958984375, "entropy/mean": 0.19921875, "entropy/min": 0.1474609375, "epoch": 0.09533468559837728, "frac_reward_zero_std": 0.0, "grad_norm": 0.11338211596012115, "learning_rate": 9.818343372047507e-07, "loss": 0.0059763966128230095, "reward": 1.6972222328186035, "reward_std": 0.19861575961112976, "rewards/DiagnosisAccuracyORM/mean": 0.4437500238418579, "rewards/DiagnosisAccuracyORM/std": 0.299539178609848, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.253472238779068, "rewards/KeyDiagnosticEvidenceORM/std": 0.08720798790454865, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/mean_length": 835.2083740234375, "completions/min_length": 651.0, "entropy/max": 0.3681640625, "entropy/mean": 0.22509765625, "entropy/min": 0.134765625, "epoch": 0.0973630831643002, "frac_reward_zero_std": 0.0, "grad_norm": 0.11714430153369904, "learning_rate": 9.809646034984849e-07, "loss": 0.0043884157203137875, "reward": 1.6929397583007812, "reward_std": 0.29502028226852417, "rewards/DiagnosisAccuracyORM/mean": 0.49965283274650574, "rewards/DiagnosisAccuracyORM/std": 0.3356037735939026, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.19328702986240387, "rewards/KeyDiagnosticEvidenceORM/std": 0.10329455137252808, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/mean_length": 808.4791870117188, "completions/min_length": 629.0, "entropy/max": 0.310546875, "entropy/mean": 0.208984375, "entropy/min": 0.15625, "epoch": 0.09939148073022312, "frac_reward_zero_std": 0.0, "grad_norm": 0.12517869472503662, "learning_rate": 9.800749368358007e-07, "loss": 0.01587693952023983, "reward": 1.7634508609771729, "reward_std": 0.20999068021774292, "rewards/DiagnosisAccuracyORM/mean": 0.4191220700740814, "rewards/DiagnosisAccuracyORM/std": 0.31536784768104553, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34432873129844666, "rewards/KeyDiagnosticEvidenceORM/std": 0.13532540202140808, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/mean_length": 821.875, "completions/min_length": 648.0, "entropy/max": 0.40625, "entropy/mean": 0.212890625, "entropy/min": 0.13134765625, "epoch": 0.10141987829614604, "frac_reward_zero_std": 0.0, "grad_norm": 0.11106550693511963, "learning_rate": 9.791653740877838e-07, "loss": 0.009475169703364372, "reward": 1.8036375045776367, "reward_std": 0.24643665552139282, "rewards/DiagnosisAccuracyORM/mean": 0.47435513138771057, "rewards/DiagnosisAccuracyORM/std": 0.2740626037120819, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32928240299224854, "rewards/KeyDiagnosticEvidenceORM/std": 0.1573425680398941, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/mean_length": 829.7291870117188, "completions/min_length": 646.0, "entropy/max": 0.3310546875, "entropy/mean": 0.208984375, "entropy/min": 0.14306640625, "epoch": 0.10344827586206896, "frac_reward_zero_std": 0.0, "grad_norm": 0.3002406656742096, "learning_rate": 9.782359529500866e-07, "loss": 0.0015572980046272278, "reward": 1.6655919551849365, "reward_std": 0.2679152190685272, "rewards/DiagnosisAccuracyORM/mean": 0.4179067611694336, "rewards/DiagnosisAccuracyORM/std": 0.2684316039085388, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24768519401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.1215316504240036, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1044.0, "completions/mean_length": 829.7291870117188, "completions/min_length": 694.0, "entropy/max": 0.4541015625, "entropy/mean": 0.24267578125, "entropy/min": 0.146484375, "epoch": 0.10547667342799188, "frac_reward_zero_std": 0.0, "grad_norm": 0.1260191947221756, "learning_rate": 9.772867119413665e-07, "loss": -0.0038258880376815796, "reward": 1.6140873432159424, "reward_std": 0.2145945131778717, "rewards/DiagnosisAccuracyORM/mean": 0.39533731341362, "rewards/DiagnosisAccuracyORM/std": 0.28312239050865173, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.21875, "rewards/KeyDiagnosticEvidenceORM/std": 0.11744994670152664, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/mean_length": 813.3125, "completions/min_length": 664.0, "entropy/max": 0.3447265625, "entropy/mean": 0.2216796875, "entropy/min": 0.121337890625, "epoch": 0.1075050709939148, "frac_reward_zero_std": 0.0, "grad_norm": 0.1234971359372139, "learning_rate": 9.763176904016913e-07, "loss": -0.011257842183113098, "reward": 1.7352018356323242, "reward_std": 0.25214534997940063, "rewards/DiagnosisAccuracyORM/mean": 0.4672619104385376, "rewards/DiagnosisAccuracyORM/std": 0.2560977637767792, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.26793980598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.10588935762643814, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/mean_length": 787.75, "completions/min_length": 651.0, "entropy/max": 0.3056640625, "entropy/mean": 0.19287109375, "entropy/min": 0.126708984375, "epoch": 0.10953346855983773, "frac_reward_zero_std": 0.0, "grad_norm": 0.12339981645345688, "learning_rate": 9.753289284909057e-07, "loss": 0.007378344889730215, "reward": 1.7533234357833862, "reward_std": 0.19893184304237366, "rewards/DiagnosisAccuracyORM/mean": 0.46512898802757263, "rewards/DiagnosisAccuracyORM/std": 0.3355076313018799, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2881944477558136, "rewards/KeyDiagnosticEvidenceORM/std": 0.12271352112293243, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/mean_length": 820.8958740234375, "completions/min_length": 637.0, "entropy/max": 0.337890625, "entropy/mean": 0.216796875, "entropy/min": 0.13037109375, "epoch": 0.11156186612576065, "frac_reward_zero_std": 0.0, "grad_norm": 0.13423022627830505, "learning_rate": 9.743204671869693e-07, "loss": -0.011871283873915672, "reward": 1.870940923690796, "reward_std": 0.24051596224308014, "rewards/DiagnosisAccuracyORM/mean": 0.5587301850318909, "rewards/DiagnosisAccuracyORM/std": 0.268095463514328, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31221067905426025, "rewards/KeyDiagnosticEvidenceORM/std": 0.09766404330730438, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/mean_length": 811.8333740234375, "completions/min_length": 679.0, "entropy/max": 0.427734375, "entropy/mean": 0.228515625, "entropy/min": 0.13818359375, "epoch": 0.11359026369168357, "frac_reward_zero_std": 0.0, "grad_norm": 0.12117735296487808, "learning_rate": 9.73292348284258e-07, "loss": -0.02879345417022705, "reward": 1.814980149269104, "reward_std": 0.2580046057701111, "rewards/DiagnosisAccuracyORM/mean": 0.526785671710968, "rewards/DiagnosisAccuracyORM/std": 0.32423147559165955, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2881944477558136, "rewards/KeyDiagnosticEvidenceORM/std": 0.1001746878027916, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/mean_length": 798.2083740234375, "completions/min_length": 691.0, "entropy/max": 0.3017578125, "entropy/mean": 0.2109375, "entropy/min": 0.15087890625, "epoch": 0.11561866125760649, "frac_reward_zero_std": 0.0, "grad_norm": 0.10940198600292206, "learning_rate": 9.722446143918305e-07, "loss": -0.002449125051498413, "reward": 1.7322752475738525, "reward_std": 0.2630045413970947, "rewards/DiagnosisAccuracyORM/mean": 0.4643353223800659, "rewards/DiagnosisAccuracyORM/std": 0.2761802077293396, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.26793983578681946, "rewards/KeyDiagnosticEvidenceORM/std": 0.11985259503126144, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/mean_length": 799.7291870117188, "completions/min_length": 654.0, "entropy/max": 0.5078125, "entropy/mean": 0.23046875, "entropy/min": 0.138671875, "epoch": 0.11764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.14000700414180756, "learning_rate": 9.711773089316644e-07, "loss": -0.003048832295462489, "reward": 1.7162036895751953, "reward_std": 0.1277267038822174, "rewards/DiagnosisAccuracyORM/mean": 0.42569443583488464, "rewards/DiagnosisAccuracyORM/std": 0.26834484934806824, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29050925374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.1295258104801178, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/mean_length": 816.4166870117188, "completions/min_length": 674.0, "entropy/max": 0.3359375, "entropy/mean": 0.2138671875, "entropy/min": 0.15478515625, "epoch": 0.11967545638945233, "frac_reward_zero_std": 0.0, "grad_norm": 0.14202475547790527, "learning_rate": 9.70090476136855e-07, "loss": 0.015340479090809822, "reward": 1.870039701461792, "reward_std": 0.2632606029510498, "rewards/DiagnosisAccuracyORM/mean": 0.5150049924850464, "rewards/DiagnosisAccuracyORM/std": 0.25265437364578247, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.355034738779068, "rewards/KeyDiagnosticEvidenceORM/std": 0.20934362709522247, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/mean_length": 797.1458740234375, "completions/min_length": 559.0, "entropy/max": 0.2880859375, "entropy/mean": 0.18505859375, "entropy/min": 0.13525390625, "epoch": 0.12170385395537525, "frac_reward_zero_std": 0.0, "grad_norm": 0.11199690401554108, "learning_rate": 9.689841610497827e-07, "loss": -0.008260255679488182, "reward": 1.7574901580810547, "reward_std": 0.204844668507576, "rewards/DiagnosisAccuracyORM/mean": 0.42068448662757874, "rewards/DiagnosisAccuracyORM/std": 0.27858036756515503, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3368055820465088, "rewards/KeyDiagnosticEvidenceORM/std": 0.1525358110666275, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/mean_length": 831.1041870117188, "completions/min_length": 696.0, "entropy/max": 0.359375, "entropy/mean": 0.21337890625, "entropy/min": 0.13720703125, "epoch": 0.12373225152129817, "frac_reward_zero_std": 0.0, "grad_norm": 0.114821657538414, "learning_rate": 9.678584095202469e-07, "loss": 0.0020638927817344666, "reward": 1.673032522201538, "reward_std": 0.14037954807281494, "rewards/DiagnosisAccuracyORM/mean": 0.4288194477558136, "rewards/DiagnosisAccuracyORM/std": 0.2393040657043457, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24421297013759613, "rewards/KeyDiagnosticEvidenceORM/std": 0.12657703459262848, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/mean_length": 814.1458740234375, "completions/min_length": 620.0, "entropy/max": 0.3642578125, "entropy/mean": 0.23583984375, "entropy/min": 0.138671875, "epoch": 0.1257606490872211, "frac_reward_zero_std": 0.0, "grad_norm": 0.13473179936408997, "learning_rate": 9.667132682035645e-07, "loss": 0.0032253265380859375, "reward": 1.7241733074188232, "reward_std": 0.2060450315475464, "rewards/DiagnosisAccuracyORM/mean": 0.4834325313568115, "rewards/DiagnosisAccuracyORM/std": 0.20506560802459717, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24074076116085052, "rewards/KeyDiagnosticEvidenceORM/std": 0.10508689284324646, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/mean_length": 821.8958740234375, "completions/min_length": 712.0, "entropy/max": 0.3681640625, "entropy/mean": 0.21826171875, "entropy/min": 0.132568359375, "epoch": 0.12778904665314403, "frac_reward_zero_std": 0.0, "grad_norm": 0.11856493353843689, "learning_rate": 9.655487845586375e-07, "loss": -0.0031858559232205153, "reward": 1.55547297000885, "reward_std": 0.18411999940872192, "rewards/DiagnosisAccuracyORM/mean": 0.3442460298538208, "rewards/DiagnosisAccuracyORM/std": 0.29823818802833557, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.21122686564922333, "rewards/KeyDiagnosticEvidenceORM/std": 0.1291171908378601, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/mean_length": 842.5625, "completions/min_length": 645.0, "entropy/max": 0.3701171875, "entropy/mean": 0.24462890625, "entropy/min": 0.1455078125, "epoch": 0.12981744421906694, "frac_reward_zero_std": 0.0, "grad_norm": 0.14262160658836365, "learning_rate": 9.643650068459862e-07, "loss": -0.010190392844378948, "reward": 1.7898313999176025, "reward_std": 0.23417270183563232, "rewards/DiagnosisAccuracyORM/mean": 0.5311508178710938, "rewards/DiagnosisAccuracyORM/std": 0.3082343339920044, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2586805522441864, "rewards/KeyDiagnosticEvidenceORM/std": 0.131812185049057, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/mean_length": 806.6041870117188, "completions/min_length": 661.0, "entropy/max": 0.4521484375, "entropy/mean": 0.228515625, "entropy/min": 0.13720703125, "epoch": 0.13184584178498987, "frac_reward_zero_std": 0.0, "grad_norm": 0.15146926045417786, "learning_rate": 9.631619841257474e-07, "loss": -0.007546992506831884, "reward": 1.756134271621704, "reward_std": 0.1790562868118286, "rewards/DiagnosisAccuracyORM/mean": 0.4656250476837158, "rewards/DiagnosisAccuracyORM/std": 0.2886834442615509, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29050925374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.10565168410539627, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/mean_length": 809.8958740234375, "completions/min_length": 676.0, "entropy/max": 0.2978515625, "entropy/mean": 0.21923828125, "entropy/min": 0.14990234375, "epoch": 0.13387423935091278, "frac_reward_zero_std": 0.0, "grad_norm": 0.11985605210065842, "learning_rate": 9.619397662556433e-07, "loss": 0.003061177907511592, "reward": 1.7903274297714233, "reward_std": 0.2262352854013443, "rewards/DiagnosisAccuracyORM/mean": 0.49866077303886414, "rewards/DiagnosisAccuracyORM/std": 0.33893394470214844, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2916666567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.10330117493867874, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/mean_length": 831.625, "completions/min_length": 672.0, "entropy/max": 0.32421875, "entropy/mean": 0.2177734375, "entropy/min": 0.14599609375, "epoch": 0.1359026369168357, "frac_reward_zero_std": 0.0, "grad_norm": 0.1382552683353424, "learning_rate": 9.60698403888914e-07, "loss": 0.014355001971125603, "reward": 1.7972222566604614, "reward_std": 0.2682862877845764, "rewards/DiagnosisAccuracyORM/mean": 0.5298611521720886, "rewards/DiagnosisAccuracyORM/std": 0.3118608593940735, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2673611342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.11061131209135056, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/mean_length": 831.1875, "completions/min_length": 639.0, "entropy/max": 0.4521484375, "entropy/mean": 0.24853515625, "entropy/min": 0.13916015625, "epoch": 0.13793103448275862, "frac_reward_zero_std": 0.0, "grad_norm": 0.150004044175148, "learning_rate": 9.594379484722184e-07, "loss": -0.005417183041572571, "reward": 1.7954118251800537, "reward_std": 0.204132080078125, "rewards/DiagnosisAccuracyORM/mean": 0.49332842230796814, "rewards/DiagnosisAccuracyORM/std": 0.3324432075023651, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3020833432674408, "rewards/KeyDiagnosticEvidenceORM/std": 0.14275068044662476, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/mean_length": 815.1666870117188, "completions/min_length": 681.0, "entropy/max": 0.30078125, "entropy/mean": 0.201171875, "entropy/min": 0.14306640625, "epoch": 0.13995943204868155, "frac_reward_zero_std": 0.0, "grad_norm": 0.1450711041688919, "learning_rate": 9.581584522435023e-07, "loss": 0.01174693088978529, "reward": 1.6944444179534912, "reward_std": 0.2836110591888428, "rewards/DiagnosisAccuracyORM/mean": 0.3715277910232544, "rewards/DiagnosisAccuracyORM/std": 0.30793216824531555, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3229166567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.15834473073482513, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/mean_length": 802.5833740234375, "completions/min_length": 646.0, "entropy/max": 0.439453125, "entropy/mean": 0.23583984375, "entropy/min": 0.15771484375, "epoch": 0.14198782961460446, "frac_reward_zero_std": 0.0, "grad_norm": 0.15052121877670288, "learning_rate": 9.568599682298334e-07, "loss": -0.0033463239669799805, "reward": 1.6644675731658936, "reward_std": 0.2490997612476349, "rewards/DiagnosisAccuracyORM/mean": 0.4121527671813965, "rewards/DiagnosisAccuracyORM/std": 0.25450411438941956, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25231483578681946, "rewards/KeyDiagnosticEvidenceORM/std": 0.13738349080085754, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/mean_length": 820.7083740234375, "completions/min_length": 627.0, "entropy/max": 0.4091796875, "entropy/mean": 0.22314453125, "entropy/min": 0.1376953125, "epoch": 0.1440162271805274, "frac_reward_zero_std": 0.0, "grad_norm": 0.15216268599033356, "learning_rate": 9.555425502452037e-07, "loss": -0.006539615802466869, "reward": 1.7668981552124023, "reward_std": 0.23932816088199615, "rewards/DiagnosisAccuracyORM/mean": 0.4642361104488373, "rewards/DiagnosisAccuracyORM/std": 0.3385756015777588, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3026620149612427, "rewards/KeyDiagnosticEvidenceORM/std": 0.13416793942451477, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/mean_length": 813.8125, "completions/min_length": 640.0, "entropy/max": 0.2998046875, "entropy/mean": 0.208984375, "entropy/min": 0.13330078125, "epoch": 0.1460446247464503, "frac_reward_zero_std": 0.0, "grad_norm": 0.10752397775650024, "learning_rate": 9.542062528882988e-07, "loss": -0.015982862561941147, "reward": 1.6311012506484985, "reward_std": 0.21697205305099487, "rewards/DiagnosisAccuracyORM/mean": 0.33596229553222656, "rewards/DiagnosisAccuracyORM/std": 0.2915518283843994, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2951388955116272, "rewards/KeyDiagnosticEvidenceORM/std": 0.08998748660087585, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/mean_length": 807.5208740234375, "completions/min_length": 696.0, "entropy/max": 0.2861328125, "entropy/mean": 0.19873046875, "entropy/min": 0.1376953125, "epoch": 0.14807302231237324, "frac_reward_zero_std": 0.0, "grad_norm": 0.14907850325107574, "learning_rate": 9.528511315402357e-07, "loss": -0.010460883378982544, "reward": 1.69703209400177, "reward_std": 0.1859007328748703, "rewards/DiagnosisAccuracyORM/mean": 0.43082842230796814, "rewards/DiagnosisAccuracyORM/std": 0.27209678292274475, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.26620373129844666, "rewards/KeyDiagnosticEvidenceORM/std": 0.15596380829811096, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/mean_length": 820.9375, "completions/min_length": 652.0, "entropy/max": 0.365234375, "entropy/mean": 0.2138671875, "entropy/min": 0.13525390625, "epoch": 0.15010141987829614, "frac_reward_zero_std": 0.0, "grad_norm": 0.13077475130558014, "learning_rate": 9.514772423622675e-07, "loss": -0.005724040325731039, "reward": 1.9171297550201416, "reward_std": 0.2559543251991272, "rewards/DiagnosisAccuracyORM/mean": 0.5947916507720947, "rewards/DiagnosisAccuracyORM/std": 0.2761855125427246, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32233795523643494, "rewards/KeyDiagnosticEvidenceORM/std": 0.1286076009273529, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/mean_length": 803.0625, "completions/min_length": 580.0, "entropy/max": 0.271484375, "entropy/mean": 0.19775390625, "entropy/min": 0.14208984375, "epoch": 0.15212981744421908, "frac_reward_zero_std": 0.0, "grad_norm": 0.1116199940443039, "learning_rate": 9.500846422934555e-07, "loss": -0.004646075423806906, "reward": 1.9417990446090698, "reward_std": 0.20835119485855103, "rewards/DiagnosisAccuracyORM/mean": 0.526289701461792, "rewards/DiagnosisAccuracyORM/std": 0.2563733756542206, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.41550925374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.17379914224147797, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/mean_length": 824.375, "completions/min_length": 678.0, "entropy/max": 0.4541015625, "entropy/mean": 0.25244140625, "entropy/min": 0.13427734375, "epoch": 0.15415821501014199, "frac_reward_zero_std": 0.0, "grad_norm": 0.152813121676445, "learning_rate": 9.486733890483099e-07, "loss": -0.0085414107888937, "reward": 1.6630291938781738, "reward_std": 0.18674838542938232, "rewards/DiagnosisAccuracyORM/mean": 0.4107142984867096, "rewards/DiagnosisAccuracyORM/std": 0.29603612422943115, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25231483578681946, "rewards/KeyDiagnosticEvidenceORM/std": 0.1230086088180542, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/mean_length": 847.5833740234375, "completions/min_length": 691.0, "entropy/max": 0.3515625, "entropy/mean": 0.23583984375, "entropy/min": 0.1474609375, "epoch": 0.15618661257606492, "frac_reward_zero_std": 0.0, "grad_norm": 0.12662728130817413, "learning_rate": 9.472435411143977e-07, "loss": 0.005145291797816753, "reward": 1.7033565044403076, "reward_std": 0.2685714364051819, "rewards/DiagnosisAccuracyORM/mean": 0.45451390743255615, "rewards/DiagnosisAccuracyORM/std": 0.37057459354400635, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24884259700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.15806801617145538, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/mean_length": 805.2708740234375, "completions/min_length": 672.0, "entropy/max": 0.421875, "entropy/mean": 0.22900390625, "entropy/min": 0.1376953125, "epoch": 0.15821501014198783, "frac_reward_zero_std": 0.0, "grad_norm": 0.13954439759254456, "learning_rate": 9.457951577499186e-07, "loss": 0.010954150930047035, "reward": 1.69050931930542, "reward_std": 0.2360486537218094, "rewards/DiagnosisAccuracyORM/mean": 0.3907985985279083, "rewards/DiagnosisAccuracyORM/std": 0.31365326046943665, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29971063137054443, "rewards/KeyDiagnosticEvidenceORM/std": 0.11317272484302521, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/mean_length": 821.8333740234375, "completions/min_length": 691.0, "entropy/max": 0.3994140625, "entropy/mean": 0.2265625, "entropy/min": 0.13671875, "epoch": 0.16024340770791076, "frac_reward_zero_std": 0.0, "grad_norm": 0.12138354778289795, "learning_rate": 9.443282989812493e-07, "loss": 0.02305983565747738, "reward": 1.7747024297714233, "reward_std": 0.24845775961875916, "rewards/DiagnosisAccuracyORM/mean": 0.47435513138771057, "rewards/DiagnosisAccuracyORM/std": 0.3089793622493744, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.300347238779068, "rewards/KeyDiagnosticEvidenceORM/std": 0.11987542361021042, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/mean_length": 788.0625, "completions/min_length": 631.0, "entropy/max": 0.26953125, "entropy/mean": 0.205078125, "entropy/min": 0.14208984375, "epoch": 0.16227180527383367, "frac_reward_zero_std": 0.0, "grad_norm": 0.11933927237987518, "learning_rate": 9.428430256004557e-07, "loss": -0.0034116930328309536, "reward": 1.88761568069458, "reward_std": 0.2423515021800995, "rewards/DiagnosisAccuracyORM/mean": 0.5687499642372131, "rewards/DiagnosisAccuracyORM/std": 0.3483772873878479, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31886574625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.1342596858739853, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/mean_length": 819.6875, "completions/min_length": 684.0, "entropy/max": 0.4404296875, "entropy/mean": 0.2353515625, "entropy/min": 0.14697265625, "epoch": 0.1643002028397566, "frac_reward_zero_std": 0.0, "grad_norm": 0.14897024631500244, "learning_rate": 9.413393991627736e-07, "loss": -0.012881237082183361, "reward": 1.7197916507720947, "reward_std": 0.21921217441558838, "rewards/DiagnosisAccuracyORM/mean": 0.4090277850627899, "rewards/DiagnosisAccuracyORM/std": 0.25069543719291687, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3107638657093048, "rewards/KeyDiagnosticEvidenceORM/std": 0.13914352655410767, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/mean_length": 816.7291870117188, "completions/min_length": 681.0, "entropy/max": 0.3154296875, "entropy/mean": 0.21630859375, "entropy/min": 0.1435546875, "epoch": 0.1663286004056795, "frac_reward_zero_std": 0.0, "grad_norm": 0.123128242790699, "learning_rate": 9.398174819840577e-07, "loss": 0.0027243546210229397, "reward": 1.7590444087982178, "reward_std": 0.24663956463336945, "rewards/DiagnosisAccuracyORM/mean": 0.45985451340675354, "rewards/DiagnosisAccuracyORM/std": 0.2902185320854187, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29918980598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.13635268807411194, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/mean_length": 833.7083740234375, "completions/min_length": 645.0, "entropy/max": 0.314453125, "entropy/mean": 0.2138671875, "entropy/min": 0.15869140625, "epoch": 0.16835699797160245, "frac_reward_zero_std": 0.0, "grad_norm": 0.12444489449262619, "learning_rate": 9.382773371381984e-07, "loss": 0.0074301064014434814, "reward": 1.9092262983322144, "reward_std": 0.2743685841560364, "rewards/DiagnosisAccuracyORM/mean": 0.6198743581771851, "rewards/DiagnosisAccuracyORM/std": 0.3280838131904602, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28935185074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.11684852093458176, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1139.0, "completions/mean_length": 821.6666870117188, "completions/min_length": 674.0, "entropy/max": 0.3564453125, "entropy/mean": 0.205078125, "entropy/min": 0.1279296875, "epoch": 0.17038539553752535, "frac_reward_zero_std": 0.0, "grad_norm": 0.10591732710599899, "learning_rate": 9.367190284545085e-07, "loss": 0.027501672506332397, "reward": 1.9086805582046509, "reward_std": 0.2553911805152893, "rewards/DiagnosisAccuracyORM/mean": 0.5770833492279053, "rewards/DiagnosisAccuracyORM/std": 0.35972845554351807, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.331597238779068, "rewards/KeyDiagnosticEvidenceORM/std": 0.13934984803199768, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/mean_length": 814.6458740234375, "completions/min_length": 642.0, "entropy/max": 0.296875, "entropy/mean": 0.2060546875, "entropy/min": 0.15966796875, "epoch": 0.1724137931034483, "frac_reward_zero_std": 0.0, "grad_norm": 0.1296047866344452, "learning_rate": 9.351426205150776e-07, "loss": -0.008044074289500713, "reward": 1.7426421642303467, "reward_std": 0.25099673867225647, "rewards/DiagnosisAccuracyORM/mean": 0.4208829402923584, "rewards/DiagnosisAccuracyORM/std": 0.3271651268005371, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32175925374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.12591056525707245, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/mean_length": 831.5, "completions/min_length": 655.0, "entropy/max": 0.427734375, "entropy/mean": 0.232421875, "entropy/min": 0.146484375, "epoch": 0.1744421906693712, "frac_reward_zero_std": 0.0, "grad_norm": 0.12450467050075531, "learning_rate": 9.335481786520953e-07, "loss": -0.01633710414171219, "reward": 1.829249382019043, "reward_std": 0.23108692467212677, "rewards/DiagnosisAccuracyORM/mean": 0.5439484119415283, "rewards/DiagnosisAccuracyORM/std": 0.30184054374694824, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28530094027519226, "rewards/KeyDiagnosticEvidenceORM/std": 0.1516847461462021, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/mean_length": 820.0416870117188, "completions/min_length": 657.0, "entropy/max": 0.4140625, "entropy/mean": 0.2216796875, "entropy/min": 0.1337890625, "epoch": 0.17647058823529413, "frac_reward_zero_std": 0.0, "grad_norm": 0.22955887019634247, "learning_rate": 9.319357689451442e-07, "loss": -0.010537244379520416, "reward": 1.6116650104522705, "reward_std": 0.18346759676933289, "rewards/DiagnosisAccuracyORM/mean": 0.32462796568870544, "rewards/DiagnosisAccuracyORM/std": 0.21649612486362457, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2870370149612427, "rewards/KeyDiagnosticEvidenceORM/std": 0.12492883950471878, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/mean_length": 818.2916870117188, "completions/min_length": 682.0, "entropy/max": 0.3623046875, "entropy/mean": 0.22802734375, "entropy/min": 0.15087890625, "epoch": 0.17849898580121704, "frac_reward_zero_std": 0.0, "grad_norm": 0.1347259283065796, "learning_rate": 9.303054582184608e-07, "loss": 0.005469841416925192, "reward": 1.7633929252624512, "reward_std": 0.25071120262145996, "rewards/DiagnosisAccuracyORM/mean": 0.4312169551849365, "rewards/DiagnosisAccuracyORM/std": 0.2808866500854492, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3321759104728699, "rewards/KeyDiagnosticEvidenceORM/std": 0.14438968896865845, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/mean_length": 811.875, "completions/min_length": 648.0, "entropy/max": 0.412109375, "entropy/mean": 0.22119140625, "entropy/min": 0.15576171875, "epoch": 0.18052738336713997, "frac_reward_zero_std": 0.0, "grad_norm": 0.12709927558898926, "learning_rate": 9.286573140381662e-07, "loss": -0.007428791373968124, "reward": 1.758763313293457, "reward_std": 0.20948579907417297, "rewards/DiagnosisAccuracyORM/mean": 0.44799932837486267, "rewards/DiagnosisAccuracyORM/std": 0.3385353088378906, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3107638657093048, "rewards/KeyDiagnosticEvidenceORM/std": 0.13748179376125336, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/mean_length": 815.2291870117188, "completions/min_length": 613.0, "entropy/max": 0.267578125, "entropy/mean": 0.20263671875, "entropy/min": 0.162109375, "epoch": 0.18255578093306288, "frac_reward_zero_std": 0.0, "grad_norm": 0.13259252905845642, "learning_rate": 9.269914047094658e-07, "loss": 0.007999598048627377, "reward": 1.850578784942627, "reward_std": 0.21016284823417664, "rewards/DiagnosisAccuracyORM/mean": 0.5774305462837219, "rewards/DiagnosisAccuracyORM/std": 0.2915574908256531, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27314814925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.12051425129175186, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/mean_length": 824.0416870117188, "completions/min_length": 680.0, "entropy/max": 0.3251953125, "entropy/mean": 0.22900390625, "entropy/min": 0.15283203125, "epoch": 0.1845841784989858, "frac_reward_zero_std": 0.0, "grad_norm": 0.11995429545640945, "learning_rate": 9.253077992738192e-07, "loss": 0.007536890916526318, "reward": 1.830373764038086, "reward_std": 0.2935844659805298, "rewards/DiagnosisAccuracyORM/mean": 0.5479663014411926, "rewards/DiagnosisAccuracyORM/std": 0.30675795674324036, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2824074327945709, "rewards/KeyDiagnosticEvidenceORM/std": 0.1682678461074829, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/mean_length": 832.1458740234375, "completions/min_length": 656.0, "entropy/max": 0.3583984375, "entropy/mean": 0.236328125, "entropy/min": 0.14599609375, "epoch": 0.18661257606490872, "frac_reward_zero_std": 0.0, "grad_norm": 0.11316418647766113, "learning_rate": 9.236065675060773e-07, "loss": -0.0011370530119165778, "reward": 1.8680555820465088, "reward_std": 0.27936673164367676, "rewards/DiagnosisAccuracyORM/mean": 0.5555555820465088, "rewards/DiagnosisAccuracyORM/std": 0.3144910931587219, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3125, "rewards/KeyDiagnosticEvidenceORM/std": 0.1492583006620407, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/mean_length": 825.7083740234375, "completions/min_length": 697.0, "entropy/max": 0.4501953125, "entropy/mean": 0.212890625, "entropy/min": 0.1435546875, "epoch": 0.18864097363083165, "frac_reward_zero_std": 0.0, "grad_norm": 0.13565756380558014, "learning_rate": 9.218877799115927e-07, "loss": -0.009371365420520306, "reward": 1.862037181854248, "reward_std": 0.1983056515455246, "rewards/DiagnosisAccuracyORM/mean": 0.559374988079071, "rewards/DiagnosisAccuracyORM/std": 0.31583330035209656, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30266204476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.12118076533079147, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/mean_length": 830.0208740234375, "completions/min_length": 661.0, "entropy/max": 0.2919921875, "entropy/mean": 0.212890625, "entropy/min": 0.1650390625, "epoch": 0.19066937119675456, "frac_reward_zero_std": 0.0, "grad_norm": 0.1247071623802185, "learning_rate": 9.201515077232958e-07, "loss": -0.02097148820757866, "reward": 1.7376158237457275, "reward_std": 0.2090865969657898, "rewards/DiagnosisAccuracyORM/mean": 0.40138888359069824, "rewards/DiagnosisAccuracyORM/std": 0.27164748311042786, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33622685074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.11217568814754486, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/mean_length": 809.8958740234375, "completions/min_length": 668.0, "entropy/max": 0.2548828125, "entropy/mean": 0.193359375, "entropy/min": 0.14794921875, "epoch": 0.1926977687626775, "frac_reward_zero_std": 0.0, "grad_norm": 0.11392989754676819, "learning_rate": 9.183978228987435e-07, "loss": 0.0050832852721214294, "reward": 1.8074074983596802, "reward_std": 0.2326326221227646, "rewards/DiagnosisAccuracyORM/mean": 0.42951393127441406, "rewards/DiagnosisAccuracyORM/std": 0.27865856885910034, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3778935372829437, "rewards/KeyDiagnosticEvidenceORM/std": 0.169667586684227, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/mean_length": 815.9583740234375, "completions/min_length": 656.0, "entropy/max": 0.34765625, "entropy/mean": 0.21337890625, "entropy/min": 0.134765625, "epoch": 0.1947261663286004, "frac_reward_zero_std": 0.0, "grad_norm": 0.12328945100307465, "learning_rate": 9.166267981171369e-07, "loss": 0.0004337777791079134, "reward": 1.8058366775512695, "reward_std": 0.23451656103134155, "rewards/DiagnosisAccuracyORM/mean": 0.4562995731830597, "rewards/DiagnosisAccuracyORM/std": 0.2736385464668274, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34953704476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.12393933534622192, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/mean_length": 829.75, "completions/min_length": 626.0, "entropy/max": 0.474609375, "entropy/mean": 0.263671875, "entropy/min": 0.14599609375, "epoch": 0.19675456389452334, "frac_reward_zero_std": 0.0, "grad_norm": 0.1404096782207489, "learning_rate": 9.148385067763093e-07, "loss": 0.015630226582288742, "reward": 1.7514965534210205, "reward_std": 0.22404052317142487, "rewards/DiagnosisAccuracyORM/mean": 0.4737185537815094, "rewards/DiagnosisAccuracyORM/std": 0.28386107087135315, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2777777910232544, "rewards/KeyDiagnosticEvidenceORM/std": 0.16676513850688934, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/mean_length": 815.8541870117188, "completions/min_length": 690.0, "entropy/max": 0.310546875, "entropy/mean": 0.19970703125, "entropy/min": 0.14306640625, "epoch": 0.19878296146044624, "frac_reward_zero_std": 0.0, "grad_norm": 0.11627918481826782, "learning_rate": 9.130330229896845e-07, "loss": -0.01011887937784195, "reward": 1.655489444732666, "reward_std": 0.1951185017824173, "rewards/DiagnosisAccuracyORM/mean": 0.38754963874816895, "rewards/DiagnosisAccuracyORM/std": 0.3020539879798889, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.26793980598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.1314796358346939, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/mean_length": 815.1875, "completions/min_length": 548.0, "entropy/max": 0.4248046875, "entropy/mean": 0.22314453125, "entropy/min": 0.142578125, "epoch": 0.20081135902636918, "frac_reward_zero_std": 0.0, "grad_norm": 0.11793100833892822, "learning_rate": 9.112104215832046e-07, "loss": -0.0025345832109451294, "reward": 1.815178632736206, "reward_std": 0.2783719003200531, "rewards/DiagnosisAccuracyORM/mean": 0.47374340891838074, "rewards/DiagnosisAccuracyORM/std": 0.318571001291275, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34143519401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.10611682385206223, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/mean_length": 820.8333740234375, "completions/min_length": 682.0, "entropy/max": 0.2861328125, "entropy/mean": 0.21435546875, "entropy/min": 0.1376953125, "epoch": 0.2028397565922921, "frac_reward_zero_std": 0.0, "grad_norm": 0.1460329294204712, "learning_rate": 9.093707780922293e-07, "loss": -0.005916997324675322, "reward": 1.755753993988037, "reward_std": 0.24025841057300568, "rewards/DiagnosisAccuracyORM/mean": 0.4432539939880371, "rewards/DiagnosisAccuracyORM/std": 0.24802707135677338, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3125, "rewards/KeyDiagnosticEvidenceORM/std": 0.12256292998790741, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/mean_length": 805.1666870117188, "completions/min_length": 683.0, "entropy/max": 0.349609375, "entropy/mean": 0.19970703125, "entropy/min": 0.12841796875, "epoch": 0.20486815415821502, "frac_reward_zero_std": 0.0, "grad_norm": 0.12184421718120575, "learning_rate": 9.075141687584056e-07, "loss": 0.010923661291599274, "reward": 1.8226191997528076, "reward_std": 0.180604487657547, "rewards/DiagnosisAccuracyORM/mean": 0.5187996029853821, "rewards/DiagnosisAccuracyORM/std": 0.20075558125972748, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3038194477558136, "rewards/KeyDiagnosticEvidenceORM/std": 0.11874019354581833, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/mean_length": 818.6041870117188, "completions/min_length": 692.0, "entropy/max": 0.4189453125, "entropy/mean": 0.22412109375, "entropy/min": 0.14697265625, "epoch": 0.20689655172413793, "frac_reward_zero_std": 0.0, "grad_norm": 0.13748793303966522, "learning_rate": 9.056406705265083e-07, "loss": -0.01827327162027359, "reward": 1.609953761100769, "reward_std": 0.2584631145000458, "rewards/DiagnosisAccuracyORM/mean": 0.3552083671092987, "rewards/DiagnosisAccuracyORM/std": 0.41235578060150146, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25474536418914795, "rewards/KeyDiagnosticEvidenceORM/std": 0.149771049618721, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/mean_length": 813.0416870117188, "completions/min_length": 688.0, "entropy/max": 0.3203125, "entropy/mean": 0.19921875, "entropy/min": 0.13916015625, "epoch": 0.20892494929006086, "frac_reward_zero_std": 0.0, "grad_norm": 0.12008258700370789, "learning_rate": 9.0375036104125e-07, "loss": 0.017172526568174362, "reward": 1.7266204357147217, "reward_std": 0.246208056807518, "rewards/DiagnosisAccuracyORM/mean": 0.38229164481163025, "rewards/DiagnosisAccuracyORM/std": 0.2641594409942627, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34432873129844666, "rewards/KeyDiagnosticEvidenceORM/std": 0.19756360352039337, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/mean_length": 806.9375, "completions/min_length": 663.0, "entropy/max": 0.345703125, "entropy/mean": 0.2216796875, "entropy/min": 0.1376953125, "epoch": 0.21095334685598377, "frac_reward_zero_std": 0.0, "grad_norm": 0.1220623105764389, "learning_rate": 9.018433186440646e-07, "loss": -0.003397380467504263, "reward": 1.7452877759933472, "reward_std": 0.22028633952140808, "rewards/DiagnosisAccuracyORM/mean": 0.4032738208770752, "rewards/DiagnosisAccuracyORM/std": 0.27383115887641907, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3420139253139496, "rewards/KeyDiagnosticEvidenceORM/std": 0.11024429649114609, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/mean_length": 826.8333740234375, "completions/min_length": 652.0, "entropy/max": 0.4287109375, "entropy/mean": 0.2587890625, "entropy/min": 0.141357421875, "epoch": 0.2129817444219067, "frac_reward_zero_std": 0.0, "grad_norm": 0.12845934927463531, "learning_rate": 8.999196223698598e-07, "loss": 0.013206507079303265, "reward": 1.7292823791503906, "reward_std": 0.18443799018859863, "rewards/DiagnosisAccuracyORM/mean": 0.4711805582046509, "rewards/DiagnosisAccuracyORM/std": 0.35553276538848877, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25810185074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.11080904304981232, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/mean_length": 830.25, "completions/min_length": 653.0, "entropy/max": 0.4716796875, "entropy/mean": 0.24365234375, "entropy/min": 0.1416015625, "epoch": 0.2150101419878296, "frac_reward_zero_std": 0.0, "grad_norm": 0.12864810228347778, "learning_rate": 8.979793519437411e-07, "loss": 0.002501758513972163, "reward": 1.759143590927124, "reward_std": 0.29781365394592285, "rewards/DiagnosisAccuracyORM/mean": 0.5128472447395325, "rewards/DiagnosisAccuracyORM/std": 0.3470449149608612, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24629628658294678, "rewards/KeyDiagnosticEvidenceORM/std": 0.13859938085079193, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/mean_length": 838.125, "completions/min_length": 684.0, "entropy/max": 0.3876953125, "entropy/mean": 0.2294921875, "entropy/min": 0.13525390625, "epoch": 0.21703853955375255, "frac_reward_zero_std": 0.0, "grad_norm": 0.11914031952619553, "learning_rate": 8.960225877777094e-07, "loss": -0.0010551934828981757, "reward": 1.7385914325714111, "reward_std": 0.2516627907752991, "rewards/DiagnosisAccuracyORM/mean": 0.4017857313156128, "rewards/DiagnosisAccuracyORM/std": 0.3064654767513275, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3368055820465088, "rewards/KeyDiagnosticEvidenceORM/std": 0.16091585159301758, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/mean_length": 819.1458740234375, "completions/min_length": 664.0, "entropy/max": 0.234375, "entropy/mean": 0.1953125, "entropy/min": 0.16015625, "epoch": 0.21906693711967545, "frac_reward_zero_std": 0.0, "grad_norm": 0.10218513011932373, "learning_rate": 8.940494109673265e-07, "loss": 0.007096981164067984, "reward": 1.8415510654449463, "reward_std": 0.18910610675811768, "rewards/DiagnosisAccuracyORM/mean": 0.5128472447395325, "rewards/DiagnosisAccuracyORM/std": 0.29621806740760803, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32870370149612427, "rewards/KeyDiagnosticEvidenceORM/std": 0.12881088256835938, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1098.0, "completions/mean_length": 836.7916870117188, "completions/min_length": 571.0, "entropy/max": 0.4228515625, "entropy/mean": 0.22900390625, "entropy/min": 0.16015625, "epoch": 0.2210953346855984, "frac_reward_zero_std": 0.0, "grad_norm": 0.133299320936203, "learning_rate": 8.920599032883552e-07, "loss": -0.0032470624428242445, "reward": 1.7840278148651123, "reward_std": 0.22373968362808228, "rewards/DiagnosisAccuracyORM/mean": 0.5166666507720947, "rewards/DiagnosisAccuracyORM/std": 0.2793952524662018, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2673611044883728, "rewards/KeyDiagnosticEvidenceORM/std": 0.12601375579833984, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/mean_length": 815.0833740234375, "completions/min_length": 646.0, "entropy/max": 0.3642578125, "entropy/mean": 0.232421875, "entropy/min": 0.14111328125, "epoch": 0.2231237322515213, "frac_reward_zero_std": 0.0, "grad_norm": 0.14506608247756958, "learning_rate": 8.900541471933703e-07, "loss": -0.006624435540288687, "reward": 1.7131614685058594, "reward_std": 0.24181126058101654, "rewards/DiagnosisAccuracyORM/mean": 0.4168650805950165, "rewards/DiagnosisAccuracyORM/std": 0.3015218675136566, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2962963283061981, "rewards/KeyDiagnosticEvidenceORM/std": 0.12254060059785843, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/mean_length": 821.9166870117188, "completions/min_length": 679.0, "entropy/max": 0.314453125, "entropy/mean": 0.22314453125, "entropy/min": 0.1455078125, "epoch": 0.22515212981744423, "frac_reward_zero_std": 0.0, "grad_norm": 0.12612195312976837, "learning_rate": 8.880322258083407e-07, "loss": 0.0002567693591117859, "reward": 1.7483134269714355, "reward_std": 0.17806386947631836, "rewards/DiagnosisAccuracyORM/mean": 0.4722718298435211, "rewards/DiagnosisAccuracyORM/std": 0.3096693158149719, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2760416567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.13265764713287354, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/mean_length": 814.0625, "completions/min_length": 631.0, "entropy/max": 0.2958984375, "entropy/mean": 0.21337890625, "entropy/min": 0.1591796875, "epoch": 0.22718052738336714, "frac_reward_zero_std": 0.0, "grad_norm": 0.145867258310318, "learning_rate": 8.859942229291855e-07, "loss": 0.010520076379179955, "reward": 1.7811343669891357, "reward_std": 0.16421842575073242, "rewards/DiagnosisAccuracyORM/mean": 0.4836805760860443, "rewards/DiagnosisAccuracyORM/std": 0.1719890981912613, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29745370149612427, "rewards/KeyDiagnosticEvidenceORM/std": 0.11531047523021698, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/mean_length": 818.0, "completions/min_length": 663.0, "entropy/max": 0.318359375, "entropy/mean": 0.21435546875, "entropy/min": 0.14990234375, "epoch": 0.22920892494929007, "frac_reward_zero_std": 0.0, "grad_norm": 0.1277884840965271, "learning_rate": 8.839402230183e-07, "loss": 0.010278573259711266, "reward": 1.7410550117492676, "reward_std": 0.2650969326496124, "rewards/DiagnosisAccuracyORM/mean": 0.46964287757873535, "rewards/DiagnosisAccuracyORM/std": 0.2838849425315857, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27141204476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.11144152283668518, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/mean_length": 812.3958740234375, "completions/min_length": 687.0, "entropy/max": 0.4208984375, "entropy/mean": 0.2265625, "entropy/min": 0.14697265625, "epoch": 0.23123732251521298, "frac_reward_zero_std": 0.0, "grad_norm": 0.11305257678031921, "learning_rate": 8.818703112010561e-07, "loss": -0.001402196823619306, "reward": 1.6554232835769653, "reward_std": 0.19099557399749756, "rewards/DiagnosisAccuracyORM/mean": 0.4077381193637848, "rewards/DiagnosisAccuracyORM/std": 0.297881543636322, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24768519401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.17121773958206177, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/mean_length": 836.8125, "completions/min_length": 697.0, "entropy/max": 0.3349609375, "entropy/mean": 0.224609375, "entropy/min": 0.169921875, "epoch": 0.2332657200811359, "frac_reward_zero_std": 0.0, "grad_norm": 0.12365220487117767, "learning_rate": 8.797845732622742e-07, "loss": -0.004092162009328604, "reward": 1.6349537372589111, "reward_std": 0.22493012249469757, "rewards/DiagnosisAccuracyORM/mean": 0.331597238779068, "rewards/DiagnosisAccuracyORM/std": 0.32323789596557617, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30335649847984314, "rewards/KeyDiagnosticEvidenceORM/std": 0.10866539925336838, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/mean_length": 799.8958740234375, "completions/min_length": 655.0, "entropy/max": 0.3154296875, "entropy/mean": 0.2060546875, "entropy/min": 0.154296875, "epoch": 0.23529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.1241438165307045, "learning_rate": 8.776830956426673e-07, "loss": 0.025320960208773613, "reward": 1.7564815282821655, "reward_std": 0.2312103807926178, "rewards/DiagnosisAccuracyORM/mean": 0.4711805284023285, "rewards/DiagnosisAccuracyORM/std": 0.3053838908672333, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2853009104728699, "rewards/KeyDiagnosticEvidenceORM/std": 0.09270395338535309, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/mean_length": 810.0, "completions/min_length": 671.0, "entropy/max": 0.3212890625, "entropy/mean": 0.22705078125, "entropy/min": 0.17041015625, "epoch": 0.23732251521298176, "frac_reward_zero_std": 0.0, "grad_norm": 0.1452302634716034, "learning_rate": 8.755659654352599e-07, "loss": -0.003495862241834402, "reward": 1.7670139074325562, "reward_std": 0.1816679835319519, "rewards/DiagnosisAccuracyORM/mean": 0.48229166865348816, "rewards/DiagnosisAccuracyORM/std": 0.1918371617794037, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2847222089767456, "rewards/KeyDiagnosticEvidenceORM/std": 0.10480008274316788, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/mean_length": 809.4375, "completions/min_length": 605.0, "entropy/max": 0.3125, "entropy/mean": 0.205078125, "entropy/min": 0.142578125, "epoch": 0.23935091277890466, "frac_reward_zero_std": 0.0, "grad_norm": 0.12478862702846527, "learning_rate": 8.734332703817771e-07, "loss": 0.0015888673951849341, "reward": 1.9228010177612305, "reward_std": 0.23535394668579102, "rewards/DiagnosisAccuracyORM/mean": 0.6010416150093079, "rewards/DiagnosisAccuracyORM/std": 0.2941190004348755, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32175925374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.10500874370336533, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/mean_length": 823.0833740234375, "completions/min_length": 689.0, "entropy/max": 0.2880859375, "entropy/mean": 0.20458984375, "entropy/min": 0.13671875, "epoch": 0.2413793103448276, "frac_reward_zero_std": 0.0, "grad_norm": 0.13292908668518066, "learning_rate": 8.712850988690093e-07, "loss": -0.0018789967289194465, "reward": 1.6456515789031982, "reward_std": 0.23246219754219055, "rewards/DiagnosisAccuracyORM/mean": 0.41359126567840576, "rewards/DiagnosisAccuracyORM/std": 0.2548013925552368, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.23206019401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.1298462450504303, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/mean_length": 830.3958740234375, "completions/min_length": 698.0, "entropy/max": 0.3232421875, "entropy/mean": 0.21484375, "entropy/min": 0.13134765625, "epoch": 0.2434077079107505, "frac_reward_zero_std": 0.0, "grad_norm": 0.11858820170164108, "learning_rate": 8.691215399251487e-07, "loss": -0.0046686953864991665, "reward": 1.6990079879760742, "reward_std": 0.21970191597938538, "rewards/DiagnosisAccuracyORM/mean": 0.38650795817375183, "rewards/DiagnosisAccuracyORM/std": 0.2299085557460785, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3125, "rewards/KeyDiagnosticEvidenceORM/std": 0.11971697211265564, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/mean_length": 809.0416870117188, "completions/min_length": 645.0, "entropy/max": 0.30859375, "entropy/mean": 0.201171875, "entropy/min": 0.138671875, "epoch": 0.24543610547667344, "frac_reward_zero_std": 0.0, "grad_norm": 0.12301164865493774, "learning_rate": 8.669426832160995e-07, "loss": 0.009763977490365505, "reward": 1.9319610595703125, "reward_std": 0.26451969146728516, "rewards/DiagnosisAccuracyORM/mean": 0.5922619104385376, "rewards/DiagnosisAccuracyORM/std": 0.24026493728160858, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33969905972480774, "rewards/KeyDiagnosticEvidenceORM/std": 0.15130546689033508, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/mean_length": 835.6458740234375, "completions/min_length": 654.0, "entropy/max": 0.4833984375, "entropy/mean": 0.291015625, "entropy/min": 0.18359375, "epoch": 0.24746450304259635, "frac_reward_zero_std": 0.0, "grad_norm": 0.1371147781610489, "learning_rate": 8.647486190417624e-07, "loss": -0.02835744619369507, "reward": 1.5317625999450684, "reward_std": 0.18222954869270325, "rewards/DiagnosisAccuracyORM/mean": 0.28581348061561584, "rewards/DiagnosisAccuracyORM/std": 0.2184305638074875, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24594907462596893, "rewards/KeyDiagnosticEvidenceORM/std": 0.15235967934131622, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/mean_length": 814.2083740234375, "completions/min_length": 679.0, "entropy/max": 0.337890625, "entropy/mean": 0.21533203125, "entropy/min": 0.1455078125, "epoch": 0.24949290060851928, "frac_reward_zero_std": 0.0, "grad_norm": 0.1271534413099289, "learning_rate": 8.625394383322914e-07, "loss": 0.008206671103835106, "reward": 1.6536046266555786, "reward_std": 0.1799350380897522, "rewards/DiagnosisAccuracyORM/mean": 0.3613591194152832, "rewards/DiagnosisAccuracyORM/std": 0.20290517807006836, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29224538803100586, "rewards/KeyDiagnosticEvidenceORM/std": 0.19733494520187378, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/mean_length": 811.4583740234375, "completions/min_length": 599.0, "entropy/max": 0.3857421875, "entropy/mean": 0.2119140625, "entropy/min": 0.13525390625, "epoch": 0.2515212981744422, "frac_reward_zero_std": 0.0, "grad_norm": 0.13038672506809235, "learning_rate": 8.60315232644326e-07, "loss": 0.005760247819125652, "reward": 1.7325644493103027, "reward_std": 0.1813516616821289, "rewards/DiagnosisAccuracyORM/mean": 0.4235366880893707, "rewards/DiagnosisAccuracyORM/std": 0.27920040488243103, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3090277910232544, "rewards/KeyDiagnosticEvidenceORM/std": 0.15232039988040924, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/mean_length": 825.75, "completions/min_length": 683.0, "entropy/max": 0.326171875, "entropy/mean": 0.20263671875, "entropy/min": 0.13232421875, "epoch": 0.2535496957403651, "frac_reward_zero_std": 0.0, "grad_norm": 0.13286292552947998, "learning_rate": 8.580760941571966e-07, "loss": -0.006303255911916494, "reward": 1.71875, "reward_std": 0.16434067487716675, "rewards/DiagnosisAccuracyORM/mean": 0.4618055820465088, "rewards/DiagnosisAccuracyORM/std": 0.32538318634033203, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2569444477558136, "rewards/KeyDiagnosticEvidenceORM/std": 0.11424367129802704, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/mean_length": 812.6875, "completions/min_length": 669.0, "entropy/max": 0.271484375, "entropy/mean": 0.203125, "entropy/min": 0.15185546875, "epoch": 0.25557809330628806, "frac_reward_zero_std": 0.0, "grad_norm": 0.11273028701543808, "learning_rate": 8.55822115669104e-07, "loss": -0.009039806202054024, "reward": 1.8316552639007568, "reward_std": 0.2830929458141327, "rewards/DiagnosisAccuracyORM/mean": 0.48993054032325745, "rewards/DiagnosisAccuracyORM/std": 0.27011817693710327, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3417245149612427, "rewards/KeyDiagnosticEvidenceORM/std": 0.12904664874076843, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/mean_length": 817.9583740234375, "completions/min_length": 639.0, "entropy/max": 0.3251953125, "entropy/mean": 0.22314453125, "entropy/min": 0.1337890625, "epoch": 0.25760649087221094, "frac_reward_zero_std": 0.0, "grad_norm": 0.13034573197364807, "learning_rate": 8.535533905932737e-07, "loss": -0.002482369542121887, "reward": 1.8474537134170532, "reward_std": 0.162545308470726, "rewards/DiagnosisAccuracyORM/mean": 0.5899305939674377, "rewards/DiagnosisAccuracyORM/std": 0.23325073719024658, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25752314925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.10654623806476593, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/mean_length": 816.75, "completions/min_length": 651.0, "entropy/max": 0.365234375, "entropy/mean": 0.2275390625, "entropy/min": 0.16455078125, "epoch": 0.25963488843813387, "frac_reward_zero_std": 0.0, "grad_norm": 0.1248960942029953, "learning_rate": 8.512700129540846e-07, "loss": 0.0021280627697706223, "reward": 1.819179892539978, "reward_std": 0.19520017504692078, "rewards/DiagnosisAccuracyORM/mean": 0.47311508655548096, "rewards/DiagnosisAccuracyORM/std": 0.24483174085617065, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34606480598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.16676104068756104, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/mean_length": 812.375, "completions/min_length": 591.0, "entropy/max": 0.3740234375, "entropy/mean": 0.21435546875, "entropy/min": 0.126220703125, "epoch": 0.2616632860040568, "frac_reward_zero_std": 0.0, "grad_norm": 0.14280135929584503, "learning_rate": 8.489720773831716e-07, "loss": 0.004902114626020193, "reward": 1.781266689300537, "reward_std": 0.21518315374851227, "rewards/DiagnosisAccuracyORM/mean": 0.49017858505249023, "rewards/DiagnosisAccuracyORM/std": 0.2743552625179291, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2910879850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.13253672420978546, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/mean_length": 803.2083740234375, "completions/min_length": 673.0, "entropy/max": 0.3173828125, "entropy/mean": 0.19189453125, "entropy/min": 0.115966796875, "epoch": 0.26369168356997974, "frac_reward_zero_std": 0.0, "grad_norm": 0.11524657160043716, "learning_rate": 8.466596791155054e-07, "loss": 0.005301987286657095, "reward": 1.636772632598877, "reward_std": 0.2138466238975525, "rewards/DiagnosisAccuracyORM/mean": 0.3387400805950165, "rewards/DiagnosisAccuracyORM/std": 0.29921647906303406, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29803240299224854, "rewards/KeyDiagnosticEvidenceORM/std": 0.1279677301645279, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/mean_length": 817.9583740234375, "completions/min_length": 672.0, "entropy/max": 0.3564453125, "entropy/mean": 0.2265625, "entropy/min": 0.14794921875, "epoch": 0.2657200811359026, "frac_reward_zero_std": 0.0, "grad_norm": 0.13525544106960297, "learning_rate": 8.443329139854433e-07, "loss": -0.021195529028773308, "reward": 1.5911872386932373, "reward_std": 0.1444985270500183, "rewards/DiagnosisAccuracyORM/mean": 0.3035714328289032, "rewards/DiagnosisAccuracyORM/std": 0.2383478879928589, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28761574625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.10742856562137604, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/mean_length": 815.9375, "completions/min_length": 623.0, "entropy/max": 0.2822265625, "entropy/mean": 0.20361328125, "entropy/min": 0.1337890625, "epoch": 0.26774847870182555, "frac_reward_zero_std": 0.0, "grad_norm": 0.11663325130939484, "learning_rate": 8.419918784227591e-07, "loss": 0.007955777458846569, "reward": 1.8596892356872559, "reward_std": 0.2442464828491211, "rewards/DiagnosisAccuracyORM/mean": 0.5165178775787354, "rewards/DiagnosisAccuracyORM/std": 0.3006595969200134, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3431713283061981, "rewards/KeyDiagnosticEvidenceORM/std": 0.12832005321979523, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/mean_length": 810.7916870117188, "completions/min_length": 616.0, "entropy/max": 0.400390625, "entropy/mean": 0.22607421875, "entropy/min": 0.14404296875, "epoch": 0.2697768762677485, "frac_reward_zero_std": 0.0, "grad_norm": 0.1368180215358734, "learning_rate": 8.396366694486466e-07, "loss": 0.009174264036118984, "reward": 1.5724455118179321, "reward_std": 0.21857908368110657, "rewards/DiagnosisAccuracyORM/mean": 0.28425100445747375, "rewards/DiagnosisAccuracyORM/std": 0.2146962583065033, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2881944477558136, "rewards/KeyDiagnosticEvidenceORM/std": 0.12718087434768677, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/mean_length": 809.9583740234375, "completions/min_length": 648.0, "entropy/max": 0.28125, "entropy/mean": 0.18896484375, "entropy/min": 0.1298828125, "epoch": 0.2718052738336714, "frac_reward_zero_std": 0.0, "grad_norm": 0.11866018176078796, "learning_rate": 8.372673846716975e-07, "loss": -0.01147710345685482, "reward": 1.8464946746826172, "reward_std": 0.1918104737997055, "rewards/DiagnosisAccuracyORM/mean": 0.5015873312950134, "rewards/DiagnosisAccuracyORM/std": 0.32731035351753235, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34490740299224854, "rewards/KeyDiagnosticEvidenceORM/std": 0.11712917685508728, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/mean_length": 809.8125, "completions/min_length": 657.0, "entropy/max": 0.24365234375, "entropy/mean": 0.20068359375, "entropy/min": 0.1494140625, "epoch": 0.2738336713995943, "frac_reward_zero_std": 0.0, "grad_norm": 0.12989524006843567, "learning_rate": 8.348841222838578e-07, "loss": 0.0097354082390666, "reward": 1.8135913610458374, "reward_std": 0.1608535349369049, "rewards/DiagnosisAccuracyORM/mean": 0.48546627163887024, "rewards/DiagnosisAccuracyORM/std": 0.17220935225486755, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.328125, "rewards/KeyDiagnosticEvidenceORM/std": 0.1458262950181961, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/mean_length": 799.7916870117188, "completions/min_length": 669.0, "entropy/max": 0.3759765625, "entropy/mean": 0.21044921875, "entropy/min": 0.138671875, "epoch": 0.27586206896551724, "frac_reward_zero_std": 0.0, "grad_norm": 0.12338030338287354, "learning_rate": 8.324869810563573e-07, "loss": -0.002525920746847987, "reward": 1.7456846237182617, "reward_std": 0.19652435183525085, "rewards/DiagnosisAccuracyORM/mean": 0.47832342982292175, "rewards/DiagnosisAccuracyORM/std": 0.29927995800971985, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2673611044883728, "rewards/KeyDiagnosticEvidenceORM/std": 0.12782466411590576, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/mean_length": 820.8125, "completions/min_length": 631.0, "entropy/max": 0.33203125, "entropy/mean": 0.21435546875, "entropy/min": 0.1396484375, "epoch": 0.2778904665314402, "frac_reward_zero_std": 0.0, "grad_norm": 0.15100005269050598, "learning_rate": 8.300760603356158e-07, "loss": -0.011019034311175346, "reward": 1.60636568069458, "reward_std": 0.2932201325893402, "rewards/DiagnosisAccuracyORM/mean": 0.3222222328186035, "rewards/DiagnosisAccuracyORM/std": 0.3149230480194092, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28414350748062134, "rewards/KeyDiagnosticEvidenceORM/std": 0.12063764780759811, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/mean_length": 829.2291870117188, "completions/min_length": 693.0, "entropy/max": 0.4189453125, "entropy/mean": 0.25439453125, "entropy/min": 0.173828125, "epoch": 0.2799188640973631, "frac_reward_zero_std": 0.0, "grad_norm": 0.1281760036945343, "learning_rate": 8.276514600391271e-07, "loss": -0.006916274782270193, "reward": 1.8124423027038574, "reward_std": 0.27694591879844666, "rewards/DiagnosisAccuracyORM/mean": 0.5352430939674377, "rewards/DiagnosisAccuracyORM/std": 0.343967080116272, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2771990895271301, "rewards/KeyDiagnosticEvidenceORM/std": 0.13444297015666962, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/mean_length": 810.8125, "completions/min_length": 687.0, "entropy/max": 0.4775390625, "entropy/mean": 0.23974609375, "entropy/min": 0.14892578125, "epoch": 0.281947261663286, "frac_reward_zero_std": 0.0, "grad_norm": 0.1537797749042511, "learning_rate": 8.25213280651317e-07, "loss": -0.0005926539888605475, "reward": 1.7613922357559204, "reward_std": 0.3012186586856842, "rewards/DiagnosisAccuracyORM/mean": 0.48477184772491455, "rewards/DiagnosisAccuracyORM/std": 0.3354915380477905, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27662038803100586, "rewards/KeyDiagnosticEvidenceORM/std": 0.12195305526256561, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/mean_length": 808.25, "completions/min_length": 643.0, "entropy/max": 0.384765625, "entropy/mean": 0.23095703125, "entropy/min": 0.1376953125, "epoch": 0.2839756592292089, "frac_reward_zero_std": 0.0, "grad_norm": 0.15346093475818634, "learning_rate": 8.227616232193792e-07, "loss": -0.001694927690550685, "reward": 1.624768614768982, "reward_std": 0.15867877006530762, "rewards/DiagnosisAccuracyORM/mean": 0.33888888359069824, "rewards/DiagnosisAccuracyORM/std": 0.2808576226234436, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28587964177131653, "rewards/KeyDiagnosticEvidenceORM/std": 0.13153813779354095, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/mean_length": 833.4791870117188, "completions/min_length": 636.0, "entropy/max": 0.4931640625, "entropy/mean": 0.25390625, "entropy/min": 0.15234375, "epoch": 0.28600405679513186, "frac_reward_zero_std": 0.0, "grad_norm": 0.1458454579114914, "learning_rate": 8.202965893490876e-07, "loss": 0.009878119453787804, "reward": 1.623478889465332, "reward_std": 0.2162424921989441, "rewards/DiagnosisAccuracyORM/mean": 0.3827381134033203, "rewards/DiagnosisAccuracyORM/std": 0.24826288223266602, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24074073135852814, "rewards/KeyDiagnosticEvidenceORM/std": 0.08639629930257797, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/mean_length": 804.8958740234375, "completions/min_length": 642.0, "entropy/max": 0.3154296875, "entropy/mean": 0.197265625, "entropy/min": 0.14599609375, "epoch": 0.2880324543610548, "frac_reward_zero_std": 0.0, "grad_norm": 0.12224603444337845, "learning_rate": 8.178182812005852e-07, "loss": 0.012797107920050621, "reward": 1.7461806535720825, "reward_std": 0.2027672827243805, "rewards/DiagnosisAccuracyORM/mean": 0.39722225069999695, "rewards/DiagnosisAccuracyORM/std": 0.2530740797519684, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3489583432674408, "rewards/KeyDiagnosticEvidenceORM/std": 0.1227511465549469, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/mean_length": 837.7916870117188, "completions/min_length": 685.0, "entropy/max": 0.4150390625, "entropy/mean": 0.244140625, "entropy/min": 0.14892578125, "epoch": 0.29006085192697767, "frac_reward_zero_std": 0.0, "grad_norm": 0.12430359423160553, "learning_rate": 8.153268014841506e-07, "loss": -0.0017951478948816657, "reward": 1.6250495910644531, "reward_std": 0.2233019471168518, "rewards/DiagnosisAccuracyORM/mean": 0.3802579343318939, "rewards/DiagnosisAccuracyORM/std": 0.3264061510562897, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2447916716337204, "rewards/KeyDiagnosticEvidenceORM/std": 0.1289793699979782, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/mean_length": 822.0416870117188, "completions/min_length": 678.0, "entropy/max": 0.326171875, "entropy/mean": 0.23486328125, "entropy/min": 0.15478515625, "epoch": 0.2920892494929006, "frac_reward_zero_std": 0.0, "grad_norm": 0.13054370880126953, "learning_rate": 8.128222534559406e-07, "loss": 0.010875615291297436, "reward": 1.7079862356185913, "reward_std": 0.2259819507598877, "rewards/DiagnosisAccuracyORM/mean": 0.36076387763023376, "rewards/DiagnosisAccuracyORM/std": 0.3185088038444519, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3472222089767456, "rewards/KeyDiagnosticEvidenceORM/std": 0.1064322218298912, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/mean_length": 803.8125, "completions/min_length": 661.0, "entropy/max": 0.3623046875, "entropy/mean": 0.2138671875, "entropy/min": 0.13623046875, "epoch": 0.29411764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 0.1293053925037384, "learning_rate": 8.103047409137114e-07, "loss": 0.012729689478874207, "reward": 1.8618056774139404, "reward_std": 0.35442304611206055, "rewards/DiagnosisAccuracyORM/mean": 0.4868055582046509, "rewards/DiagnosisAccuracyORM/std": 0.33093032240867615, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.375, "rewards/KeyDiagnosticEvidenceORM/std": 0.18611861765384674, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/mean_length": 808.25, "completions/min_length": 662.0, "entropy/max": 0.330078125, "entropy/mean": 0.232421875, "entropy/min": 0.173828125, "epoch": 0.2961460446247465, "frac_reward_zero_std": 0.0, "grad_norm": 0.12310237437486649, "learning_rate": 8.07774368192517e-07, "loss": 0.014468032866716385, "reward": 1.8291584253311157, "reward_std": 0.16886404156684875, "rewards/DiagnosisAccuracyORM/mean": 0.5684523582458496, "rewards/DiagnosisAccuracyORM/std": 0.3094799518585205, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2607060372829437, "rewards/KeyDiagnosticEvidenceORM/std": 0.13779830932617188, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/mean_length": 813.125, "completions/min_length": 607.0, "entropy/max": 0.4921875, "entropy/mean": 0.2470703125, "entropy/min": 0.1640625, "epoch": 0.29817444219066935, "frac_reward_zero_std": 0.0, "grad_norm": 0.14252594113349915, "learning_rate": 8.052312401603847e-07, "loss": 0.00958841573446989, "reward": 1.7891204357147217, "reward_std": 0.24142566323280334, "rewards/DiagnosisAccuracyORM/mean": 0.4621528089046478, "rewards/DiagnosisAccuracyORM/std": 0.34558576345443726, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32696759700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.1406203657388687, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/mean_length": 805.6458740234375, "completions/min_length": 662.0, "entropy/max": 0.28515625, "entropy/mean": 0.2080078125, "entropy/min": 0.13037109375, "epoch": 0.3002028397565923, "frac_reward_zero_std": 0.0, "grad_norm": 0.1216658353805542, "learning_rate": 8.02675462213969e-07, "loss": -0.00578635698184371, "reward": 1.7929399013519287, "reward_std": 0.190058171749115, "rewards/DiagnosisAccuracyORM/mean": 0.4520833492279053, "rewards/DiagnosisAccuracyORM/std": 0.25528979301452637, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3408564627170563, "rewards/KeyDiagnosticEvidenceORM/std": 0.09953588992357254, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/mean_length": 817.1041870117188, "completions/min_length": 670.0, "entropy/max": 0.4677734375, "entropy/mean": 0.2490234375, "entropy/min": 0.16064453125, "epoch": 0.3022312373225152, "frac_reward_zero_std": 0.0, "grad_norm": 0.1556999236345291, "learning_rate": 8.001071402741842e-07, "loss": 0.007764945738017559, "reward": 1.6957671642303467, "reward_std": 0.19492460787296295, "rewards/DiagnosisAccuracyORM/mean": 0.4156745970249176, "rewards/DiagnosisAccuracyORM/std": 0.2646826207637787, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28009259700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.12393933534622192, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/mean_length": 805.625, "completions/min_length": 665.0, "entropy/max": 0.3076171875, "entropy/mean": 0.18603515625, "entropy/min": 0.1259765625, "epoch": 0.30425963488843816, "frac_reward_zero_std": 0.0, "grad_norm": 0.1396939605474472, "learning_rate": 7.975263807818136e-07, "loss": -0.003007782157510519, "reward": 1.8500826358795166, "reward_std": 0.21808230876922607, "rewards/DiagnosisAccuracyORM/mean": 0.4501984119415283, "rewards/DiagnosisAccuracyORM/std": 0.27449294924736023, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.39988425374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.10019347071647644, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/mean_length": 814.2083740234375, "completions/min_length": 642.0, "entropy/max": 0.30859375, "entropy/mean": 0.2080078125, "entropy/min": 0.14990234375, "epoch": 0.30628803245436104, "frac_reward_zero_std": 0.0, "grad_norm": 0.1417379230260849, "learning_rate": 7.949332906930994e-07, "loss": 0.00827902089804411, "reward": 1.719527244567871, "reward_std": 0.17967236042022705, "rewards/DiagnosisAccuracyORM/mean": 0.4484623372554779, "rewards/DiagnosisAccuracyORM/std": 0.34195587038993835, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.271064817905426, "rewards/KeyDiagnosticEvidenceORM/std": 0.13770873844623566, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/mean_length": 815.2291870117188, "completions/min_length": 661.0, "entropy/max": 0.3232421875, "entropy/mean": 0.216796875, "entropy/min": 0.13671875, "epoch": 0.30831643002028397, "frac_reward_zero_std": 0.0, "grad_norm": 0.12745344638824463, "learning_rate": 7.923279774753091e-07, "loss": 0.0013674895744770765, "reward": 1.9675265550613403, "reward_std": 0.2540363669395447, "rewards/DiagnosisAccuracyORM/mean": 0.631299614906311, "rewards/DiagnosisAccuracyORM/std": 0.32035335898399353, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33622685074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.12090951204299927, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/mean_length": 805.2708740234375, "completions/min_length": 600.0, "entropy/max": 0.32421875, "entropy/mean": 0.21484375, "entropy/min": 0.142578125, "epoch": 0.3103448275862069, "frac_reward_zero_std": 0.0, "grad_norm": 0.12478020787239075, "learning_rate": 7.897105491022817e-07, "loss": -0.016588037833571434, "reward": 1.6464699506759644, "reward_std": 0.279399573802948, "rewards/DiagnosisAccuracyORM/mean": 0.36319446563720703, "rewards/DiagnosisAccuracyORM/std": 0.292214959859848, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2832754850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.15070153772830963, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/mean_length": 811.2708740234375, "completions/min_length": 679.0, "entropy/max": 0.2734375, "entropy/mean": 0.1962890625, "entropy/min": 0.13818359375, "epoch": 0.31237322515212984, "frac_reward_zero_std": 0.0, "grad_norm": 0.11869263648986816, "learning_rate": 7.870811140499542e-07, "loss": -0.008451782166957855, "reward": 1.8186343908309937, "reward_std": 0.27376216650009155, "rewards/DiagnosisAccuracyORM/mean": 0.5298611521720886, "rewards/DiagnosisAccuracyORM/std": 0.27688658237457275, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28877314925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.13749173283576965, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/mean_length": 818.9791870117188, "completions/min_length": 641.0, "entropy/max": 0.349609375, "entropy/mean": 0.22705078125, "entropy/min": 0.1259765625, "epoch": 0.3144016227180527, "frac_reward_zero_std": 0.0, "grad_norm": 0.15424081683158875, "learning_rate": 7.844397812918635e-07, "loss": -0.014082526788115501, "reward": 1.6536706686019897, "reward_std": 0.18972930312156677, "rewards/DiagnosisAccuracyORM/mean": 0.3380456268787384, "rewards/DiagnosisAccuracyORM/std": 0.2672125995159149, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31562501192092896, "rewards/KeyDiagnosticEvidenceORM/std": 0.12935177981853485, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/mean_length": 808.3333740234375, "completions/min_length": 674.0, "entropy/max": 0.44921875, "entropy/mean": 0.234375, "entropy/min": 0.17041015625, "epoch": 0.31643002028397565, "frac_reward_zero_std": 0.0, "grad_norm": 0.14738547801971436, "learning_rate": 7.817866602946325e-07, "loss": 0.007538774982094765, "reward": 1.8602266311645508, "reward_std": 0.2450771927833557, "rewards/DiagnosisAccuracyORM/mean": 0.5172867178916931, "rewards/DiagnosisAccuracyORM/std": 0.310725599527359, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3429398238658905, "rewards/KeyDiagnosticEvidenceORM/std": 0.14720377326011658, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/mean_length": 819.3541870117188, "completions/min_length": 652.0, "entropy/max": 0.3271484375, "entropy/mean": 0.212890625, "entropy/min": 0.142578125, "epoch": 0.3184584178498986, "frac_reward_zero_std": 0.0, "grad_norm": 0.13093268871307373, "learning_rate": 7.791218610134322e-07, "loss": 0.014131969772279263, "reward": 1.788872480392456, "reward_std": 0.27587971091270447, "rewards/DiagnosisAccuracyORM/mean": 0.46016862988471985, "rewards/DiagnosisAccuracyORM/std": 0.3002402186393738, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32870370149612427, "rewards/KeyDiagnosticEvidenceORM/std": 0.10555451363325119, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/mean_length": 821.9166870117188, "completions/min_length": 621.0, "entropy/max": 0.3466796875, "entropy/mean": 0.2177734375, "entropy/min": 0.128662109375, "epoch": 0.3204868154158215, "frac_reward_zero_std": 0.0, "grad_norm": 0.1306992471218109, "learning_rate": 7.764454938874251e-07, "loss": 0.016994360834360123, "reward": 1.8540509939193726, "reward_std": 0.20125925540924072, "rewards/DiagnosisAccuracyORM/mean": 0.5340277552604675, "rewards/DiagnosisAccuracyORM/std": 0.3363684415817261, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32002314925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.1349913626909256, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/mean_length": 831.2916870117188, "completions/min_length": 708.0, "entropy/max": 0.4228515625, "entropy/mean": 0.2265625, "entropy/min": 0.119384765625, "epoch": 0.3225152129817444, "frac_reward_zero_std": 0.0, "grad_norm": 0.12497025728225708, "learning_rate": 7.737576698351878e-07, "loss": 0.005258071236312389, "reward": 1.6677249670028687, "reward_std": 0.18029798567295074, "rewards/DiagnosisAccuracyORM/mean": 0.4125165045261383, "rewards/DiagnosisAccuracyORM/std": 0.2775883674621582, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2552083432674408, "rewards/KeyDiagnosticEvidenceORM/std": 0.12497127056121826, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/mean_length": 795.6458740234375, "completions/min_length": 646.0, "entropy/max": 0.23583984375, "entropy/mean": 0.181640625, "entropy/min": 0.14453125, "epoch": 0.32454361054766734, "frac_reward_zero_std": 0.0, "grad_norm": 0.1224096342921257, "learning_rate": 7.710585002501145e-07, "loss": -0.006692828144878149, "reward": 1.9619710445404053, "reward_std": 0.22811472415924072, "rewards/DiagnosisAccuracyORM/mean": 0.6083829402923584, "rewards/DiagnosisAccuracyORM/std": 0.3609192371368408, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.35358795523643494, "rewards/KeyDiagnosticEvidenceORM/std": 0.16184307634830475, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/mean_length": 792.7916870117188, "completions/min_length": 651.0, "entropy/max": 0.3447265625, "entropy/mean": 0.2080078125, "entropy/min": 0.146484375, "epoch": 0.3265720081135903, "frac_reward_zero_std": 0.0, "grad_norm": 0.15576981008052826, "learning_rate": 7.683480969958003e-07, "loss": -0.021036680787801743, "reward": 1.5362269878387451, "reward_std": 0.1664705127477646, "rewards/DiagnosisAccuracyORM/mean": 0.2902778089046478, "rewards/DiagnosisAccuracyORM/std": 0.23246631026268005, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24594907462596893, "rewards/KeyDiagnosticEvidenceORM/std": 0.09874933958053589, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/mean_length": 807.2708740234375, "completions/min_length": 674.0, "entropy/max": 0.2861328125, "entropy/mean": 0.20263671875, "entropy/min": 0.1396484375, "epoch": 0.3286004056795132, "frac_reward_zero_std": 0.0, "grad_norm": 0.14599521458148956, "learning_rate": 7.656265724014053e-07, "loss": -0.006965611129999161, "reward": 1.803918719291687, "reward_std": 0.2576114535331726, "rewards/DiagnosisAccuracyORM/mean": 0.4422288239002228, "rewards/DiagnosisAccuracyORM/std": 0.2904678285121918, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.36168980598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.11044266819953918, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/mean_length": 800.625, "completions/min_length": 614.0, "entropy/max": 0.29296875, "entropy/mean": 0.20703125, "entropy/min": 0.154296875, "epoch": 0.3306288032454361, "frac_reward_zero_std": 0.0, "grad_norm": 0.10831254720687866, "learning_rate": 7.628940392569993e-07, "loss": 0.0021736486814916134, "reward": 1.7550761699676514, "reward_std": 0.15753167867660522, "rewards/DiagnosisAccuracyORM/mean": 0.43968257308006287, "rewards/DiagnosisAccuracyORM/std": 0.3391660451889038, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31539350748062134, "rewards/KeyDiagnosticEvidenceORM/std": 0.12908540666103363, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/mean_length": 822.0208740234375, "completions/min_length": 661.0, "entropy/max": 0.26171875, "entropy/mean": 0.19677734375, "entropy/min": 0.14501953125, "epoch": 0.332657200811359, "frac_reward_zero_std": 0.0, "grad_norm": 0.12853948771953583, "learning_rate": 7.601506108088873e-07, "loss": -0.0057694269344210625, "reward": 1.7557871341705322, "reward_std": 0.3303091526031494, "rewards/DiagnosisAccuracyORM/mean": 0.4513889253139496, "rewards/DiagnosisAccuracyORM/std": 0.35271376371383667, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30439814925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.10886596143245697, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/mean_length": 832.2291870117188, "completions/min_length": 677.0, "entropy/max": 0.3466796875, "entropy/mean": 0.24560546875, "entropy/min": 0.16015625, "epoch": 0.33468559837728196, "frac_reward_zero_std": 0.0, "grad_norm": 0.14733770489692688, "learning_rate": 7.573964007549154e-07, "loss": 0.012863783165812492, "reward": 1.6667990684509277, "reward_std": 0.26445168256759644, "rewards/DiagnosisAccuracyORM/mean": 0.4092757999897003, "rewards/DiagnosisAccuracyORM/std": 0.29792678356170654, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25752314925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.12405931949615479, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1106.0, "completions/mean_length": 806.8125, "completions/min_length": 653.0, "entropy/max": 0.4150390625, "entropy/mean": 0.22998046875, "entropy/min": 0.1513671875, "epoch": 0.3367139959432049, "frac_reward_zero_std": 0.0, "grad_norm": 0.13278275728225708, "learning_rate": 7.5463152323976e-07, "loss": -0.010849381797015667, "reward": 1.84375, "reward_std": 0.2715475559234619, "rewards/DiagnosisAccuracyORM/mean": 0.5243055820465088, "rewards/DiagnosisAccuracyORM/std": 0.2958930432796478, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3194444477558136, "rewards/KeyDiagnosticEvidenceORM/std": 0.11125876754522324, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/mean_length": 811.0833740234375, "completions/min_length": 666.0, "entropy/max": 0.2744140625, "entropy/mean": 0.19970703125, "entropy/min": 0.1318359375, "epoch": 0.33874239350912777, "frac_reward_zero_std": 0.0, "grad_norm": 0.11617284268140793, "learning_rate": 7.518560928501968e-07, "loss": 0.01208893395960331, "reward": 1.7707176208496094, "reward_std": 0.19627505540847778, "rewards/DiagnosisAccuracyORM/mean": 0.4946759045124054, "rewards/DiagnosisAccuracyORM/std": 0.2837049961090088, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2760416567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.13564850389957428, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/mean_length": 817.2083740234375, "completions/min_length": 682.0, "entropy/max": 0.435546875, "entropy/mean": 0.23681640625, "entropy/min": 0.158203125, "epoch": 0.3407707910750507, "frac_reward_zero_std": 0.0, "grad_norm": 0.11067529767751694, "learning_rate": 7.490702246103512e-07, "loss": 0.0038489624857902527, "reward": 1.6617478132247925, "reward_std": 0.2313680797815323, "rewards/DiagnosisAccuracyORM/mean": 0.3776041567325592, "rewards/DiagnosisAccuracyORM/std": 0.24564263224601746, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2841435372829437, "rewards/KeyDiagnosticEvidenceORM/std": 0.11348521709442139, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/mean_length": 811.1458740234375, "completions/min_length": 662.0, "entropy/max": 0.4287109375, "entropy/mean": 0.23876953125, "entropy/min": 0.13818359375, "epoch": 0.34279918864097364, "frac_reward_zero_std": 0.0, "grad_norm": 0.1418542116880417, "learning_rate": 7.462740339769322e-07, "loss": -0.006590296979993582, "reward": 1.7987269163131714, "reward_std": 0.2529333233833313, "rewards/DiagnosisAccuracyORM/mean": 0.4960648715496063, "rewards/DiagnosisAccuracyORM/std": 0.308232843875885, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30266204476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.15526819229125977, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/mean_length": 806.2083740234375, "completions/min_length": 654.0, "entropy/max": 0.451171875, "entropy/mean": 0.2509765625, "entropy/min": 0.1494140625, "epoch": 0.3448275862068966, "frac_reward_zero_std": 0.0, "grad_norm": 0.14405110478401184, "learning_rate": 7.434676368344468e-07, "loss": 0.0035575437359511852, "reward": 1.4875166416168213, "reward_std": 0.2362123727798462, "rewards/DiagnosisAccuracyORM/mean": 0.288442462682724, "rewards/DiagnosisAccuracyORM/std": 0.3049767315387726, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.19907407462596893, "rewards/KeyDiagnosticEvidenceORM/std": 0.10255684703588486, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/mean_length": 798.4583740234375, "completions/min_length": 653.0, "entropy/max": 0.3984375, "entropy/mean": 0.21630859375, "entropy/min": 0.1484375, "epoch": 0.34685598377281945, "frac_reward_zero_std": 0.0, "grad_norm": 0.14083440601825714, "learning_rate": 7.406511494903981e-07, "loss": -0.0108948377892375, "reward": 1.7552084922790527, "reward_std": 0.22183626890182495, "rewards/DiagnosisAccuracyORM/mean": 0.4201389253139496, "rewards/DiagnosisAccuracyORM/std": 0.3394293189048767, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3350694179534912, "rewards/KeyDiagnosticEvidenceORM/std": 0.15208108723163605, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/mean_length": 802.8541870117188, "completions/min_length": 685.0, "entropy/max": 0.3193359375, "entropy/mean": 0.2119140625, "entropy/min": 0.15185546875, "epoch": 0.3488843813387424, "frac_reward_zero_std": 0.0, "grad_norm": 0.1563841551542282, "learning_rate": 7.378246886704638e-07, "loss": -0.009629467502236366, "reward": 1.9528934955596924, "reward_std": 0.3057028353214264, "rewards/DiagnosisAccuracyORM/mean": 0.5871527791023254, "rewards/DiagnosisAccuracyORM/std": 0.3004181385040283, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.36574074625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.1430618166923523, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/mean_length": 812.125, "completions/min_length": 638.0, "entropy/max": 0.2763671875, "entropy/mean": 0.20361328125, "entropy/min": 0.14599609375, "epoch": 0.3509127789046653, "frac_reward_zero_std": 0.0, "grad_norm": 0.13172473013401031, "learning_rate": 7.3498837151366e-07, "loss": 0.003041620133444667, "reward": 1.7174769639968872, "reward_std": 0.2310505360364914, "rewards/DiagnosisAccuracyORM/mean": 0.3899305760860443, "rewards/DiagnosisAccuracyORM/std": 0.2640102505683899, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32754629850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.10518447309732437, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/mean_length": 847.7708740234375, "completions/min_length": 686.0, "entropy/max": 0.33203125, "entropy/mean": 0.2373046875, "entropy/min": 0.1591796875, "epoch": 0.35294117647058826, "frac_reward_zero_std": 0.0, "grad_norm": 0.13001905381679535, "learning_rate": 7.321423155674857e-07, "loss": 0.007571822963654995, "reward": 1.8531994819641113, "reward_std": 0.17124193906784058, "rewards/DiagnosisAccuracyORM/mean": 0.5042410492897034, "rewards/DiagnosisAccuracyORM/std": 0.2655965983867645, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3489583432674408, "rewards/KeyDiagnosticEvidenceORM/std": 0.15607669949531555, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/mean_length": 823.6875, "completions/min_length": 651.0, "entropy/max": 0.3427734375, "entropy/mean": 0.22216796875, "entropy/min": 0.15087890625, "epoch": 0.35496957403651114, "frac_reward_zero_std": 0.0, "grad_norm": 0.1321919858455658, "learning_rate": 7.292866387830514e-07, "loss": -0.01319266390055418, "reward": 1.9716932773590088, "reward_std": 0.2236272245645523, "rewards/DiagnosisAccuracyORM/mean": 0.6094245910644531, "rewards/DiagnosisAccuracyORM/std": 0.30607736110687256, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3622685372829437, "rewards/KeyDiagnosticEvidenceORM/std": 0.13103795051574707, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/mean_length": 817.1458740234375, "completions/min_length": 665.0, "entropy/max": 0.322265625, "entropy/mean": 0.2265625, "entropy/min": 0.1357421875, "epoch": 0.35699797160243407, "frac_reward_zero_std": 0.0, "grad_norm": 0.1385192573070526, "learning_rate": 7.264214595101912e-07, "loss": -0.015549970790743828, "reward": 1.7270750999450684, "reward_std": 0.24067558348178864, "rewards/DiagnosisAccuracyORM/mean": 0.43425098061561584, "rewards/DiagnosisAccuracyORM/std": 0.22901983559131622, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29282405972480774, "rewards/KeyDiagnosticEvidenceORM/std": 0.10871504992246628, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/mean_length": 815.8958740234375, "completions/min_length": 664.0, "entropy/max": 0.2734375, "entropy/mean": 0.19775390625, "entropy/min": 0.142578125, "epoch": 0.359026369168357, "frac_reward_zero_std": 0.0, "grad_norm": 0.10887026786804199, "learning_rate": 7.23546896492557e-07, "loss": -0.000484741001855582, "reward": 1.9091269969940186, "reward_std": 0.30721133947372437, "rewards/DiagnosisAccuracyORM/mean": 0.570585310459137, "rewards/DiagnosisAccuracyORM/std": 0.3762955069541931, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3385416567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.11540979146957397, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/mean_length": 818.125, "completions/min_length": 671.0, "entropy/max": 0.4443359375, "entropy/mean": 0.24267578125, "entropy/min": 0.15771484375, "epoch": 0.36105476673427994, "frac_reward_zero_std": 0.0, "grad_norm": 0.13197927176952362, "learning_rate": 7.20663068862698e-07, "loss": 0.009382961317896843, "reward": 1.8220734596252441, "reward_std": 0.22837311029434204, "rewards/DiagnosisAccuracyORM/mean": 0.5026289820671082, "rewards/DiagnosisAccuracyORM/std": 0.36924055218696594, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3194444477558136, "rewards/KeyDiagnosticEvidenceORM/std": 0.12422599643468857, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/mean_length": 832.1458740234375, "completions/min_length": 653.0, "entropy/max": 0.376953125, "entropy/mean": 0.2470703125, "entropy/min": 0.15283203125, "epoch": 0.3630831643002028, "frac_reward_zero_std": 0.0, "grad_norm": 0.12309635430574417, "learning_rate": 7.177700961371238e-07, "loss": -0.003071139333769679, "reward": 1.823379635810852, "reward_std": 0.1413145363330841, "rewards/DiagnosisAccuracyORM/mean": 0.5357639193534851, "rewards/DiagnosisAccuracyORM/std": 0.3193890154361725, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28761574625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.17591458559036255, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/mean_length": 828.0833740234375, "completions/min_length": 629.0, "entropy/max": 0.24755859375, "entropy/mean": 0.19384765625, "entropy/min": 0.14208984375, "epoch": 0.36511156186612576, "frac_reward_zero_std": 0.0, "grad_norm": 0.12363279610872269, "learning_rate": 7.148680982113501e-07, "loss": -0.006454914808273315, "reward": 1.6655919551849365, "reward_std": 0.18857401609420776, "rewards/DiagnosisAccuracyORM/mean": 0.3692956268787384, "rewards/DiagnosisAccuracyORM/std": 0.3217686414718628, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29629626870155334, "rewards/KeyDiagnosticEvidenceORM/std": 0.13233140110969543, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/mean_length": 787.7083740234375, "completions/min_length": 598.0, "entropy/max": 0.3125, "entropy/mean": 0.21484375, "entropy/min": 0.132568359375, "epoch": 0.3671399594320487, "frac_reward_zero_std": 0.0, "grad_norm": 0.14088639616966248, "learning_rate": 7.119571953549304e-07, "loss": 0.005591270979493856, "reward": 1.8092262744903564, "reward_std": 0.2675517797470093, "rewards/DiagnosisAccuracyORM/mean": 0.5245040059089661, "rewards/DiagnosisAccuracyORM/std": 0.26315808296203613, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.284722238779068, "rewards/KeyDiagnosticEvidenceORM/std": 0.12468768656253815, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/mean_length": 824.7708740234375, "completions/min_length": 672.0, "entropy/max": 0.4365234375, "entropy/mean": 0.22705078125, "entropy/min": 0.15185546875, "epoch": 0.3691683569979716, "frac_reward_zero_std": 0.0, "grad_norm": 0.13629822432994843, "learning_rate": 7.090375082064717e-07, "loss": -0.013653427362442017, "reward": 1.660011649131775, "reward_std": 0.166181743144989, "rewards/DiagnosisAccuracyORM/mean": 0.42505788803100586, "rewards/DiagnosisAccuracyORM/std": 0.2803799510002136, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.23495371639728546, "rewards/KeyDiagnosticEvidenceORM/std": 0.07939023524522781, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/mean_length": 832.0833740234375, "completions/min_length": 700.0, "entropy/max": 0.427734375, "entropy/mean": 0.24462890625, "entropy/min": 0.15576171875, "epoch": 0.3711967545638945, "frac_reward_zero_std": 0.0, "grad_norm": 0.15138640999794006, "learning_rate": 7.061091577686349e-07, "loss": 0.008758465759456158, "reward": 1.719642996788025, "reward_std": 0.26372066140174866, "rewards/DiagnosisAccuracyORM/mean": 0.41640210151672363, "rewards/DiagnosisAccuracyORM/std": 0.2826075255870819, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30324074625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.16063079237937927, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/mean_length": 814.7083740234375, "completions/min_length": 625.0, "entropy/max": 0.30078125, "entropy/mean": 0.20556640625, "entropy/min": 0.14111328125, "epoch": 0.37322515212981744, "frac_reward_zero_std": 0.0, "grad_norm": 0.10586052387952805, "learning_rate": 7.031722654031192e-07, "loss": 0.008257612586021423, "reward": 1.6930224895477295, "reward_std": 0.21898022294044495, "rewards/DiagnosisAccuracyORM/mean": 0.36026787757873535, "rewards/DiagnosisAccuracyORM/std": 0.2982836365699768, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33275464177131653, "rewards/KeyDiagnosticEvidenceORM/std": 0.11847489327192307, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/mean_length": 807.2083740234375, "completions/min_length": 625.0, "entropy/max": 0.2822265625, "entropy/mean": 0.2138671875, "entropy/min": 0.1552734375, "epoch": 0.3752535496957404, "frac_reward_zero_std": 0.0, "grad_norm": 0.12668131291866302, "learning_rate": 7.002269528256332e-07, "loss": -0.0010716419201344252, "reward": 1.7508102655410767, "reward_std": 0.23681201040744781, "rewards/DiagnosisAccuracyORM/mean": 0.43715277314186096, "rewards/DiagnosisAccuracyORM/std": 0.30672600865364075, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31365740299224854, "rewards/KeyDiagnosticEvidenceORM/std": 0.1378755420446396, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/mean_length": 808.3541870117188, "completions/min_length": 651.0, "entropy/max": 0.3046875, "entropy/mean": 0.20947265625, "entropy/min": 0.15869140625, "epoch": 0.3772819472616633, "frac_reward_zero_std": 0.0, "grad_norm": 0.15039238333702087, "learning_rate": 6.972733421008504e-07, "loss": 0.022718658670783043, "reward": 1.8097221851348877, "reward_std": 0.2113858461380005, "rewards/DiagnosisAccuracyORM/mean": 0.5041666626930237, "rewards/DiagnosisAccuracyORM/std": 0.27033162117004395, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3055555522441864, "rewards/KeyDiagnosticEvidenceORM/std": 0.1234305277466774, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/mean_length": 801.3333740234375, "completions/min_length": 639.0, "entropy/max": 0.4736328125, "entropy/mean": 0.24462890625, "entropy/min": 0.14892578125, "epoch": 0.3793103448275862, "frac_reward_zero_std": 0.0, "grad_norm": 0.13747984170913696, "learning_rate": 6.943115556373502e-07, "loss": 0.014971653930842876, "reward": 1.7746198177337646, "reward_std": 0.27712953090667725, "rewards/DiagnosisAccuracyORM/mean": 0.48700395226478577, "rewards/DiagnosisAccuracyORM/std": 0.37048304080963135, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28761574625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.1594671905040741, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/mean_length": 814.8541870117188, "completions/min_length": 677.0, "entropy/max": 0.306640625, "entropy/mean": 0.2275390625, "entropy/min": 0.15380859375, "epoch": 0.3813387423935091, "frac_reward_zero_std": 0.0, "grad_norm": 0.13751107454299927, "learning_rate": 6.913417161825449e-07, "loss": -0.00549054890871048, "reward": 1.5904762744903564, "reward_std": 0.19034487009048462, "rewards/DiagnosisAccuracyORM/mean": 0.2918650805950165, "rewards/DiagnosisAccuracyORM/std": 0.2510434687137604, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2986111342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.10526898503303528, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/mean_length": 799.4375, "completions/min_length": 680.0, "entropy/max": 0.2958984375, "entropy/mean": 0.2041015625, "entropy/min": 0.13427734375, "epoch": 0.38336713995943206, "frac_reward_zero_std": 0.0, "grad_norm": 0.12616066634655, "learning_rate": 6.883639468175925e-07, "loss": -0.002566153882071376, "reward": 1.928703784942627, "reward_std": 0.25847694277763367, "rewards/DiagnosisAccuracyORM/mean": 0.5670139193534851, "rewards/DiagnosisAccuracyORM/std": 0.2934567928314209, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.36168980598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.10984646528959274, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/mean_length": 794.8125, "completions/min_length": 668.0, "entropy/max": 0.294921875, "entropy/mean": 0.19091796875, "entropy/min": 0.13232421875, "epoch": 0.385395537525355, "frac_reward_zero_std": 0.0, "grad_norm": 0.1266159862279892, "learning_rate": 6.853783709522962e-07, "loss": -0.00792091153562069, "reward": 1.716435194015503, "reward_std": 0.11568476259708405, "rewards/DiagnosisAccuracyORM/mean": 0.4045139253139496, "rewards/DiagnosisAccuracyORM/std": 0.2959485948085785, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3119213283061981, "rewards/KeyDiagnosticEvidenceORM/std": 0.09384799748659134, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/mean_length": 837.9166870117188, "completions/min_length": 697.0, "entropy/max": 0.3173828125, "entropy/mean": 0.21923828125, "entropy/min": 0.15087890625, "epoch": 0.38742393509127787, "frac_reward_zero_std": 0.0, "grad_norm": 0.1435844749212265, "learning_rate": 6.823851123199893e-07, "loss": 0.0011395129840821028, "reward": 1.772453784942627, "reward_std": 0.21827876567840576, "rewards/DiagnosisAccuracyORM/mean": 0.4993055760860443, "rewards/DiagnosisAccuracyORM/std": 0.3307447135448456, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27314814925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.1023966372013092, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/mean_length": 810.3958740234375, "completions/min_length": 639.0, "entropy/max": 0.33203125, "entropy/mean": 0.212890625, "entropy/min": 0.14990234375, "epoch": 0.3894523326572008, "frac_reward_zero_std": 0.0, "grad_norm": 0.12165579199790955, "learning_rate": 6.793842949724074e-07, "loss": 0.01038497593253851, "reward": 1.736689805984497, "reward_std": 0.2092963010072708, "rewards/DiagnosisAccuracyORM/mean": 0.4392361342906952, "rewards/DiagnosisAccuracyORM/std": 0.252657026052475, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29745373129844666, "rewards/KeyDiagnosticEvidenceORM/std": 0.13066156208515167, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/mean_length": 819.5416870117188, "completions/min_length": 653.0, "entropy/max": 0.3857421875, "entropy/mean": 0.216796875, "entropy/min": 0.13525390625, "epoch": 0.39148073022312374, "frac_reward_zero_std": 0.0, "grad_norm": 0.13783778250217438, "learning_rate": 6.763760432745474e-07, "loss": 0.004274055361747742, "reward": 1.6948496103286743, "reward_std": 0.2857670783996582, "rewards/DiagnosisAccuracyORM/mean": 0.3985532522201538, "rewards/DiagnosisAccuracyORM/std": 0.3059673309326172, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29629629850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.11450681835412979, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/mean_length": 815.5, "completions/min_length": 652.0, "entropy/max": 0.4326171875, "entropy/mean": 0.22998046875, "entropy/min": 0.15478515625, "epoch": 0.3935091277890467, "frac_reward_zero_std": 0.0, "grad_norm": 0.11962641775608063, "learning_rate": 6.733604818995132e-07, "loss": -0.0003544638748280704, "reward": 1.7257441282272339, "reward_std": 0.22571013867855072, "rewards/DiagnosisAccuracyORM/mean": 0.3924107253551483, "rewards/DiagnosisAccuracyORM/std": 0.3154711127281189, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3333333432674408, "rewards/KeyDiagnosticEvidenceORM/std": 0.16267885267734528, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/mean_length": 813.1666870117188, "completions/min_length": 670.0, "entropy/max": 0.2724609375, "entropy/mean": 0.20751953125, "entropy/min": 0.14453125, "epoch": 0.39553752535496955, "frac_reward_zero_std": 0.0, "grad_norm": 0.13925455510616302, "learning_rate": 6.703377358233489e-07, "loss": 0.02255905047059059, "reward": 1.8328373432159424, "reward_std": 0.21425092220306396, "rewards/DiagnosisAccuracyORM/mean": 0.4717261791229248, "rewards/DiagnosisAccuracyORM/std": 0.28155517578125, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3611111342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.18708638846874237, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/mean_length": 831.0208740234375, "completions/min_length": 724.0, "entropy/max": 0.3408203125, "entropy/mean": 0.21875, "entropy/min": 0.1533203125, "epoch": 0.3975659229208925, "frac_reward_zero_std": 0.0, "grad_norm": 0.14825904369354248, "learning_rate": 6.67307930319859e-07, "loss": 0.007414219435304403, "reward": 2.1019015312194824, "reward_std": 0.2689271867275238, "rewards/DiagnosisAccuracyORM/mean": 0.7031745910644531, "rewards/DiagnosisAccuracyORM/std": 0.28089413046836853, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.39872685074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.14181198179721832, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1131.0, "completions/mean_length": 846.9791870117188, "completions/min_length": 699.0, "entropy/max": 0.318359375, "entropy/mean": 0.216796875, "entropy/min": 0.1396484375, "epoch": 0.3995943204868154, "frac_reward_zero_std": 0.0, "grad_norm": 0.12599466741085052, "learning_rate": 6.642711909554174e-07, "loss": 0.005293172784149647, "reward": 1.8114914894104004, "reward_std": 0.19861343502998352, "rewards/DiagnosisAccuracyORM/mean": 0.46889880299568176, "rewards/DiagnosisAccuracyORM/std": 0.25566598773002625, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34259259700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.13257929682731628, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/mean_length": 811.5, "completions/min_length": 675.0, "entropy/max": 0.3212890625, "entropy/mean": 0.18798828125, "entropy/min": 0.12060546875, "epoch": 0.40162271805273836, "frac_reward_zero_std": 0.0, "grad_norm": 0.11751898378133774, "learning_rate": 6.612276435837621e-07, "loss": -0.005138958804309368, "reward": 1.952314853668213, "reward_std": 0.25421255826950073, "rewards/DiagnosisAccuracyORM/mean": 0.5680555701255798, "rewards/DiagnosisAccuracyORM/std": 0.2910776138305664, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.38425925374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.12240654975175858, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/mean_length": 798.2708740234375, "completions/min_length": 664.0, "entropy/max": 0.291015625, "entropy/mean": 0.20556640625, "entropy/min": 0.1328125, "epoch": 0.40365111561866124, "frac_reward_zero_std": 0.0, "grad_norm": 0.15449273586273193, "learning_rate": 6.581774143407809e-07, "loss": -0.002552064834162593, "reward": 1.6778273582458496, "reward_std": 0.14503905177116394, "rewards/DiagnosisAccuracyORM/mean": 0.40942463278770447, "rewards/DiagnosisAccuracyORM/std": 0.31549495458602905, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2684027850627899, "rewards/KeyDiagnosticEvidenceORM/std": 0.1015130877494812, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/mean_length": 823.6041870117188, "completions/min_length": 734.0, "entropy/max": 0.3662109375, "entropy/mean": 0.22509765625, "entropy/min": 0.1455078125, "epoch": 0.4056795131845842, "frac_reward_zero_std": 0.0, "grad_norm": 0.11598053574562073, "learning_rate": 6.551206296392826e-07, "loss": -0.012824932113289833, "reward": 1.8063658475875854, "reward_std": 0.21949326992034912, "rewards/DiagnosisAccuracyORM/mean": 0.45625004172325134, "rewards/DiagnosisAccuracyORM/std": 0.32193291187286377, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3501157760620117, "rewards/KeyDiagnosticEvidenceORM/std": 0.11341284960508347, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/mean_length": 815.2916870117188, "completions/min_length": 706.0, "entropy/max": 0.3466796875, "entropy/mean": 0.23974609375, "entropy/min": 0.15966796875, "epoch": 0.4077079107505071, "frac_reward_zero_std": 0.0, "grad_norm": 0.12757955491542816, "learning_rate": 6.52057416163759e-07, "loss": -0.010590429417788982, "reward": 1.836896538734436, "reward_std": 0.25220102071762085, "rewards/DiagnosisAccuracyORM/mean": 0.5458085536956787, "rewards/DiagnosisAccuracyORM/std": 0.29961973428726196, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29108795523643494, "rewards/KeyDiagnosticEvidenceORM/std": 0.13535571098327637, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/mean_length": 807.0208740234375, "completions/min_length": 681.0, "entropy/max": 0.30078125, "entropy/mean": 0.193359375, "entropy/min": 0.13037109375, "epoch": 0.40973630831643004, "frac_reward_zero_std": 0.0, "grad_norm": 0.1379665732383728, "learning_rate": 6.489879008651335e-07, "loss": 0.007141622714698315, "reward": 1.775719404220581, "reward_std": 0.19298383593559265, "rewards/DiagnosisAccuracyORM/mean": 0.5083581209182739, "rewards/DiagnosisAccuracyORM/std": 0.26528188586235046, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2673611044883728, "rewards/KeyDiagnosticEvidenceORM/std": 0.1342151015996933, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/mean_length": 810.6458740234375, "completions/min_length": 643.0, "entropy/max": 0.3310546875, "entropy/mean": 0.2080078125, "entropy/min": 0.14501953125, "epoch": 0.4117647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 0.12962564826011658, "learning_rate": 6.45912210955501e-07, "loss": -0.0003031840024050325, "reward": 1.7786706686019897, "reward_std": 0.26007741689682007, "rewards/DiagnosisAccuracyORM/mean": 0.46964287757873535, "rewards/DiagnosisAccuracyORM/std": 0.32699593901634216, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.309027761220932, "rewards/KeyDiagnosticEvidenceORM/std": 0.0988549217581749, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/mean_length": 797.125, "completions/min_length": 678.0, "entropy/max": 0.4755859375, "entropy/mean": 0.22119140625, "entropy/min": 0.13671875, "epoch": 0.41379310344827586, "frac_reward_zero_std": 0.0, "grad_norm": 0.1224638894200325, "learning_rate": 6.42830473902855e-07, "loss": 0.0004157125949859619, "reward": 1.8983796834945679, "reward_std": 0.1429564654827118, "rewards/DiagnosisAccuracyORM/mean": 0.6107639074325562, "rewards/DiagnosisAccuracyORM/std": 0.28354161977767944, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28761574625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.13432081043720245, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/mean_length": 818.8333740234375, "completions/min_length": 685.0, "entropy/max": 0.2998046875, "entropy/mean": 0.1982421875, "entropy/min": 0.154296875, "epoch": 0.4158215010141988, "frac_reward_zero_std": 0.0, "grad_norm": 0.1259286105632782, "learning_rate": 6.397428174258047e-07, "loss": -0.0034779757261276245, "reward": 1.67359459400177, "reward_std": 0.22815613448619843, "rewards/DiagnosisAccuracyORM/mean": 0.38829365372657776, "rewards/DiagnosisAccuracyORM/std": 0.2598622143268585, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28530094027519226, "rewards/KeyDiagnosticEvidenceORM/std": 0.10491587966680527, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/mean_length": 810.3333740234375, "completions/min_length": 559.0, "entropy/max": 0.3642578125, "entropy/mean": 0.23046875, "entropy/min": 0.16357421875, "epoch": 0.4178498985801217, "frac_reward_zero_std": 0.0, "grad_norm": 0.12125129252672195, "learning_rate": 6.366493694882829e-07, "loss": 0.004212475381791592, "reward": 1.833870768547058, "reward_std": 0.2771860361099243, "rewards/DiagnosisAccuracyORM/mean": 0.558407723903656, "rewards/DiagnosisAccuracyORM/std": 0.2726533114910126, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27546295523643494, "rewards/KeyDiagnosticEvidenceORM/std": 0.11726926267147064, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/mean_length": 823.9583740234375, "completions/min_length": 674.0, "entropy/max": 0.31640625, "entropy/mean": 0.21142578125, "entropy/min": 0.14501953125, "epoch": 0.4198782961460446, "frac_reward_zero_std": 0.0, "grad_norm": 0.09340638667345047, "learning_rate": 6.335502582942408e-07, "loss": 0.0056250146590173244, "reward": 1.7742891311645508, "reward_std": 0.26391467452049255, "rewards/DiagnosisAccuracyORM/mean": 0.5268353223800659, "rewards/DiagnosisAccuracyORM/std": 0.31834128499031067, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2474536895751953, "rewards/KeyDiagnosticEvidenceORM/std": 0.09474613517522812, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/mean_length": 836.0, "completions/min_length": 646.0, "entropy/max": 0.3544921875, "entropy/mean": 0.23095703125, "entropy/min": 0.158203125, "epoch": 0.42190669371196754, "frac_reward_zero_std": 0.0, "grad_norm": 0.1317174732685089, "learning_rate": 6.304456122823376e-07, "loss": 0.013005483895540237, "reward": 1.6287286281585693, "reward_std": 0.22599366307258606, "rewards/DiagnosisAccuracyORM/mean": 0.36599698662757874, "rewards/DiagnosisAccuracyORM/std": 0.2814234793186188, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2627314627170563, "rewards/KeyDiagnosticEvidenceORM/std": 0.11644388735294342, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/mean_length": 812.4375, "completions/min_length": 695.0, "entropy/max": 0.3271484375, "entropy/mean": 0.203125, "entropy/min": 0.13671875, "epoch": 0.4239350912778905, "frac_reward_zero_std": 0.0, "grad_norm": 0.13460104167461395, "learning_rate": 6.273355601206143e-07, "loss": 0.0013979425420984626, "reward": 1.821643590927124, "reward_std": 0.20743662118911743, "rewards/DiagnosisAccuracyORM/mean": 0.5062500238418579, "rewards/DiagnosisAccuracyORM/std": 0.333344429731369, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3153935372829437, "rewards/KeyDiagnosticEvidenceORM/std": 0.11665956676006317, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.0, "completions/mean_length": 812.9375, "completions/min_length": 670.0, "entropy/max": 0.28125, "entropy/mean": 0.21630859375, "entropy/min": 0.13720703125, "epoch": 0.4259634888438134, "frac_reward_zero_std": 0.0, "grad_norm": 0.14572183787822723, "learning_rate": 6.242202307011639e-07, "loss": 0.015557711943984032, "reward": 1.6986773014068604, "reward_std": 0.1858595311641693, "rewards/DiagnosisAccuracyORM/mean": 0.40064486861228943, "rewards/DiagnosisAccuracyORM/std": 0.27494773268699646, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29803240299224854, "rewards/KeyDiagnosticEvidenceORM/std": 0.10546213388442993, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/mean_length": 833.9583740234375, "completions/min_length": 647.0, "entropy/max": 0.3779296875, "entropy/mean": 0.23388671875, "entropy/min": 0.129150390625, "epoch": 0.4279918864097363, "frac_reward_zero_std": 0.0, "grad_norm": 0.14939194917678833, "learning_rate": 6.210997531347877e-07, "loss": -0.015970595180988312, "reward": 1.890608549118042, "reward_std": 0.19762387871742249, "rewards/DiagnosisAccuracyORM/mean": 0.5514881014823914, "rewards/DiagnosisAccuracyORM/std": 0.2740394175052643, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33912038803100586, "rewards/KeyDiagnosticEvidenceORM/std": 0.13894304633140564, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/mean_length": 794.2916870117188, "completions/min_length": 679.0, "entropy/max": 0.25439453125, "entropy/mean": 0.19140625, "entropy/min": 0.13916015625, "epoch": 0.4300202839756592, "frac_reward_zero_std": 0.0, "grad_norm": 0.12838657200336456, "learning_rate": 6.179742567456463e-07, "loss": -0.003983841743320227, "reward": 1.8594245910644531, "reward_std": 0.24345755577087402, "rewards/DiagnosisAccuracyORM/mean": 0.5469245910644531, "rewards/DiagnosisAccuracyORM/std": 0.30959704518318176, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3125, "rewards/KeyDiagnosticEvidenceORM/std": 0.12040066719055176, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/mean_length": 818.4375, "completions/min_length": 667.0, "entropy/max": 0.3662109375, "entropy/mean": 0.2412109375, "entropy/min": 0.15576171875, "epoch": 0.43204868154158216, "frac_reward_zero_std": 0.0, "grad_norm": 0.13366226851940155, "learning_rate": 6.148438710658978e-07, "loss": 0.001801932929083705, "reward": 1.798958420753479, "reward_std": 0.20405817031860352, "rewards/DiagnosisAccuracyORM/mean": 0.44826388359069824, "rewards/DiagnosisAccuracyORM/std": 0.3665738105773926, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3506944179534912, "rewards/KeyDiagnosticEvidenceORM/std": 0.1475018560886383, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/mean_length": 821.1875, "completions/min_length": 634.0, "entropy/max": 0.408203125, "entropy/mean": 0.20703125, "entropy/min": 0.121337890625, "epoch": 0.4340770791075051, "frac_reward_zero_std": 0.0, "grad_norm": 0.12716886401176453, "learning_rate": 6.117087258303313e-07, "loss": 0.00400089006870985, "reward": 1.7840442657470703, "reward_std": 0.26192429661750793, "rewards/DiagnosisAccuracyORM/mean": 0.47559523582458496, "rewards/DiagnosisAccuracyORM/std": 0.3134254515171051, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30844905972480774, "rewards/KeyDiagnosticEvidenceORM/std": 0.13441245257854462, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/mean_length": 835.125, "completions/min_length": 681.0, "entropy/max": 0.291015625, "entropy/mean": 0.22119140625, "entropy/min": 0.154296875, "epoch": 0.43610547667342797, "frac_reward_zero_std": 0.0, "grad_norm": 0.11957193911075592, "learning_rate": 6.085689509709892e-07, "loss": 0.004263155162334442, "reward": 1.9137401580810547, "reward_std": 0.2521997392177582, "rewards/DiagnosisAccuracyORM/mean": 0.5995039343833923, "rewards/DiagnosisAccuracyORM/std": 0.25079405307769775, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3142361342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.11069013923406601, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/mean_length": 811.7708740234375, "completions/min_length": 640.0, "entropy/max": 0.4267578125, "entropy/mean": 0.25537109375, "entropy/min": 0.14013671875, "epoch": 0.4381338742393509, "frac_reward_zero_std": 0.0, "grad_norm": 0.15004689991474152, "learning_rate": 6.054246766117832e-07, "loss": 0.016996361315250397, "reward": 1.7369379997253418, "reward_std": 0.18725353479385376, "rewards/DiagnosisAccuracyORM/mean": 0.4516369104385376, "rewards/DiagnosisAccuracyORM/std": 0.3250420093536377, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2853009104728699, "rewards/KeyDiagnosticEvidenceORM/std": 0.15222492814064026, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/mean_length": 799.375, "completions/min_length": 596.0, "entropy/max": 0.326171875, "entropy/mean": 0.2109375, "entropy/min": 0.14990234375, "epoch": 0.44016227180527384, "frac_reward_zero_std": 0.0, "grad_norm": 0.13948994874954224, "learning_rate": 6.022760330631005e-07, "loss": 0.009545564651489258, "reward": 1.8914188146591187, "reward_std": 0.22861593961715698, "rewards/DiagnosisAccuracyORM/mean": 0.5094742178916931, "rewards/DiagnosisAccuracyORM/std": 0.29092907905578613, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3819444179534912, "rewards/KeyDiagnosticEvidenceORM/std": 0.1485968828201294, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/mean_length": 818.4166870117188, "completions/min_length": 670.0, "entropy/max": 0.306640625, "entropy/mean": 0.21923828125, "entropy/min": 0.15185546875, "epoch": 0.4421906693711968, "frac_reward_zero_std": 0.0, "grad_norm": 0.11719366163015366, "learning_rate": 5.991231508164036e-07, "loss": -0.0063331699930131435, "reward": 1.8545138835906982, "reward_std": 0.17494502663612366, "rewards/DiagnosisAccuracyORM/mean": 0.519444465637207, "rewards/DiagnosisAccuracyORM/std": 0.2542855739593506, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3350694477558136, "rewards/KeyDiagnosticEvidenceORM/std": 0.1324644386768341, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/mean_length": 813.7291870117188, "completions/min_length": 722.0, "entropy/max": 0.3837890625, "entropy/mean": 0.21923828125, "entropy/min": 0.146484375, "epoch": 0.44421906693711966, "frac_reward_zero_std": 0.0, "grad_norm": 0.11404789984226227, "learning_rate": 5.959661605388229e-07, "loss": 0.0063425577245652676, "reward": 1.6416254043579102, "reward_std": 0.22104883193969727, "rewards/DiagnosisAccuracyORM/mean": 0.37831512093544006, "rewards/DiagnosisAccuracyORM/std": 0.25675860047340393, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.26331019401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.14496061205863953, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/mean_length": 807.1666870117188, "completions/min_length": 648.0, "entropy/max": 0.2841796875, "entropy/mean": 0.2001953125, "entropy/min": 0.14697265625, "epoch": 0.4462474645030426, "frac_reward_zero_std": 0.0, "grad_norm": 0.12464037537574768, "learning_rate": 5.928051930677404e-07, "loss": -0.014831995591521263, "reward": 1.5657904148101807, "reward_std": 0.15055440366268158, "rewards/DiagnosisAccuracyORM/mean": 0.24171626567840576, "rewards/DiagnosisAccuracyORM/std": 0.21493765711784363, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32407405972480774, "rewards/KeyDiagnosticEvidenceORM/std": 0.15228895843029022, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/mean_length": 815.375, "completions/min_length": 661.0, "entropy/max": 0.302734375, "entropy/mean": 0.2021484375, "entropy/min": 0.1318359375, "epoch": 0.4482758620689655, "frac_reward_zero_std": 0.0, "grad_norm": 0.11437603831291199, "learning_rate": 5.896403794053678e-07, "loss": 0.010314693674445152, "reward": 1.851388931274414, "reward_std": 0.19065804779529572, "rewards/DiagnosisAccuracyORM/mean": 0.5006945133209229, "rewards/DiagnosisAccuracyORM/std": 0.2816692590713501, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3506944179534912, "rewards/KeyDiagnosticEvidenceORM/std": 0.13866707682609558, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/mean_length": 832.8958740234375, "completions/min_length": 664.0, "entropy/max": 0.36328125, "entropy/mean": 0.21923828125, "entropy/min": 0.14697265625, "epoch": 0.45030425963488846, "frac_reward_zero_std": 0.0, "grad_norm": 0.11886170506477356, "learning_rate": 5.864718507133175e-07, "loss": 0.005942851305007935, "reward": 1.753257393836975, "reward_std": 0.24226181209087372, "rewards/DiagnosisAccuracyORM/mean": 0.4618799686431885, "rewards/DiagnosisAccuracyORM/std": 0.3329581022262573, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29137730598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.13412299752235413, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/mean_length": 801.7708740234375, "completions/min_length": 676.0, "entropy/max": 0.234375, "entropy/mean": 0.18212890625, "entropy/min": 0.14697265625, "epoch": 0.45233265720081134, "frac_reward_zero_std": 0.0, "grad_norm": 0.11673898994922638, "learning_rate": 5.832997383071659e-07, "loss": 0.00044934204197488725, "reward": 1.7204365730285645, "reward_std": 0.24658861756324768, "rewards/DiagnosisAccuracyORM/mean": 0.41140875220298767, "rewards/DiagnosisAccuracyORM/std": 0.27342620491981506, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3090277910232544, "rewards/KeyDiagnosticEvidenceORM/std": 0.12562230229377747, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/mean_length": 819.4166870117188, "completions/min_length": 685.0, "entropy/max": 0.3564453125, "entropy/mean": 0.22802734375, "entropy/min": 0.14599609375, "epoch": 0.4543610547667343, "frac_reward_zero_std": 0.0, "grad_norm": 0.16343778371810913, "learning_rate": 5.801241736510128e-07, "loss": -0.008004551753401756, "reward": 1.6852431297302246, "reward_std": 0.18778465688228607, "rewards/DiagnosisAccuracyORM/mean": 0.42656251788139343, "rewards/DiagnosisAccuracyORM/std": 0.2783171534538269, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2586805522441864, "rewards/KeyDiagnosticEvidenceORM/std": 0.13379013538360596, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/mean_length": 785.7708740234375, "completions/min_length": 697.0, "entropy/max": 0.3818359375, "entropy/mean": 0.19921875, "entropy/min": 0.140625, "epoch": 0.4563894523326572, "frac_reward_zero_std": 0.0, "grad_norm": 0.1319209635257721, "learning_rate": 5.769452883520309e-07, "loss": -0.00914886873215437, "reward": 1.703918695449829, "reward_std": 0.14550334215164185, "rewards/DiagnosisAccuracyORM/mean": 0.4244047701358795, "rewards/DiagnosisAccuracyORM/std": 0.25304511189460754, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2795138955116272, "rewards/KeyDiagnosticEvidenceORM/std": 0.13601110875606537, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/mean_length": 827.7916870117188, "completions/min_length": 660.0, "entropy/max": 0.328125, "entropy/mean": 0.2021484375, "entropy/min": 0.14208984375, "epoch": 0.45841784989858014, "frac_reward_zero_std": 0.0, "grad_norm": 0.12122435122728348, "learning_rate": 5.737632141550135e-07, "loss": 0.006807416677474976, "reward": 1.6851190328598022, "reward_std": 0.22897762060165405, "rewards/DiagnosisAccuracyORM/mean": 0.46810516715049744, "rewards/DiagnosisAccuracyORM/std": 0.30332648754119873, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2170138955116272, "rewards/KeyDiagnosticEvidenceORM/std": 0.10034355521202087, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/mean_length": 816.5208740234375, "completions/min_length": 654.0, "entropy/max": 0.2900390625, "entropy/mean": 0.1962890625, "entropy/min": 0.13427734375, "epoch": 0.460446247464503, "frac_reward_zero_std": 0.0, "grad_norm": 0.11456945538520813, "learning_rate": 5.70578082936913e-07, "loss": 0.0004285623726900667, "reward": 1.8325233459472656, "reward_std": 0.17910043895244598, "rewards/DiagnosisAccuracyORM/mean": 0.5003471970558167, "rewards/DiagnosisAccuracyORM/std": 0.26884064078330994, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33217594027519226, "rewards/KeyDiagnosticEvidenceORM/std": 0.12275811284780502, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/mean_length": 805.9791870117188, "completions/min_length": 666.0, "entropy/max": 0.47265625, "entropy/mean": 0.2275390625, "entropy/min": 0.1240234375, "epoch": 0.46247464503042596, "frac_reward_zero_std": 0.0, "grad_norm": 0.10548324137926102, "learning_rate": 5.673900267013769e-07, "loss": -0.01022819709032774, "reward": 1.7168898582458496, "reward_std": 0.12610270082950592, "rewards/DiagnosisAccuracyORM/mean": 0.4234870970249176, "rewards/DiagnosisAccuracyORM/std": 0.308651328086853, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.293402761220932, "rewards/KeyDiagnosticEvidenceORM/std": 0.211619570851326, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/mean_length": 794.3541870117188, "completions/min_length": 681.0, "entropy/max": 0.28662109375, "entropy/mean": 0.19287109375, "entropy/min": 0.14697265625, "epoch": 0.4645030425963489, "frac_reward_zero_std": 0.0, "grad_norm": 0.1195535659790039, "learning_rate": 5.641991775732755e-07, "loss": 0.0007696127286180854, "reward": 1.816840410232544, "reward_std": 0.1955527812242508, "rewards/DiagnosisAccuracyORM/mean": 0.5078125, "rewards/DiagnosisAccuracyORM/std": 0.24747785925865173, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3090277910232544, "rewards/KeyDiagnosticEvidenceORM/std": 0.14378201961517334, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/mean_length": 822.4583740234375, "completions/min_length": 646.0, "entropy/max": 0.390625, "entropy/mean": 0.228515625, "entropy/min": 0.15283203125, "epoch": 0.4665314401622718, "frac_reward_zero_std": 0.0, "grad_norm": 0.12858031690120697, "learning_rate": 5.610056677932273e-07, "loss": 0.004146588500589132, "reward": 1.5943617820739746, "reward_std": 0.14547646045684814, "rewards/DiagnosisAccuracyORM/mean": 0.3050099313259125, "rewards/DiagnosisAccuracyORM/std": 0.20634052157402039, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28935185074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.08379175513982773, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/mean_length": 812.7083740234375, "completions/min_length": 698.0, "entropy/max": 0.2861328125, "entropy/mean": 0.19287109375, "entropy/min": 0.134765625, "epoch": 0.4685598377281947, "frac_reward_zero_std": 0.0, "grad_norm": 0.13214130699634552, "learning_rate": 5.578096297121178e-07, "loss": 0.006542464252561331, "reward": 1.6462962627410889, "reward_std": 0.2211487889289856, "rewards/DiagnosisAccuracyORM/mean": 0.39803242683410645, "rewards/DiagnosisAccuracyORM/std": 0.2867772579193115, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.248263880610466, "rewards/KeyDiagnosticEvidenceORM/std": 0.13209211826324463, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/mean_length": 822.125, "completions/min_length": 673.0, "entropy/max": 0.3564453125, "entropy/mean": 0.2109375, "entropy/min": 0.13916015625, "epoch": 0.47058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 0.13563752174377441, "learning_rate": 5.546111957856154e-07, "loss": 0.0010186657309532166, "reward": 1.8459491729736328, "reward_std": 0.24801650643348694, "rewards/DiagnosisAccuracyORM/mean": 0.5027777552604675, "rewards/DiagnosisAccuracyORM/std": 0.275267630815506, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34317126870155334, "rewards/KeyDiagnosticEvidenceORM/std": 0.14524348080158234, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/mean_length": 818.25, "completions/min_length": 638.0, "entropy/max": 0.3056640625, "entropy/mean": 0.21923828125, "entropy/min": 0.12451171875, "epoch": 0.4726166328600406, "frac_reward_zero_std": 0.0, "grad_norm": 0.14274072647094727, "learning_rate": 5.514104985686801e-07, "loss": -0.006074508186429739, "reward": 1.7311344146728516, "reward_std": 0.24650467932224274, "rewards/DiagnosisAccuracyORM/mean": 0.47361111640930176, "rewards/DiagnosisAccuracyORM/std": 0.30629396438598633, "rewards/FormatRewardORM/mean": 0.9791666865348816, "rewards/FormatRewardORM/std": 0.14433756470680237, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2783564627170563, "rewards/KeyDiagnosticEvidenceORM/std": 0.1305825263261795, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/mean_length": 822.3125, "completions/min_length": 621.0, "entropy/max": 0.3154296875, "entropy/mean": 0.21044921875, "entropy/min": 0.13671875, "epoch": 0.4746450304259635, "frac_reward_zero_std": 0.0, "grad_norm": 0.1248089000582695, "learning_rate": 5.482076707100722e-07, "loss": 0.01107122004032135, "reward": 1.751124382019043, "reward_std": 0.2488340437412262, "rewards/DiagnosisAccuracyORM/mean": 0.4397817850112915, "rewards/DiagnosisAccuracyORM/std": 0.31497839093208313, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31134259700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.13253284990787506, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/mean_length": 805.6458740234375, "completions/min_length": 660.0, "entropy/max": 0.419921875, "entropy/mean": 0.23876953125, "entropy/min": 0.146484375, "epoch": 0.4766734279918864, "frac_reward_zero_std": 0.0, "grad_norm": 0.13866344094276428, "learning_rate": 5.450028449468526e-07, "loss": -0.006702361162751913, "reward": 1.6890708208084106, "reward_std": 0.2206643521785736, "rewards/DiagnosisAccuracyORM/mean": 0.41476520895957947, "rewards/DiagnosisAccuracyORM/std": 0.2656435966491699, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2743055522441864, "rewards/KeyDiagnosticEvidenceORM/std": 0.11193914711475372, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/mean_length": 809.1666870117188, "completions/min_length": 693.0, "entropy/max": 0.302734375, "entropy/mean": 0.1982421875, "entropy/min": 0.13037109375, "epoch": 0.4787018255578093, "frac_reward_zero_std": 0.0, "grad_norm": 0.1187082976102829, "learning_rate": 5.417961540988835e-07, "loss": 0.006172160618007183, "reward": 1.9412038326263428, "reward_std": 0.19911371171474457, "rewards/DiagnosisAccuracyORM/mean": 0.6107639074325562, "rewards/DiagnosisAccuracyORM/std": 0.314005047082901, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33043983578681946, "rewards/KeyDiagnosticEvidenceORM/std": 0.1142062321305275, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/mean_length": 812.0833740234375, "completions/min_length": 687.0, "entropy/max": 0.296875, "entropy/mean": 0.2021484375, "entropy/min": 0.13525390625, "epoch": 0.48073022312373226, "frac_reward_zero_std": 0.0, "grad_norm": 0.11957315355539322, "learning_rate": 5.385877310633232e-07, "loss": 0.005786076188087463, "reward": 1.723512053489685, "reward_std": 0.1745818853378296, "rewards/DiagnosisAccuracyORM/mean": 0.4249007999897003, "rewards/DiagnosisAccuracyORM/std": 0.21797175705432892, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2986111044883728, "rewards/KeyDiagnosticEvidenceORM/std": 0.09266152232885361, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/mean_length": 823.1458740234375, "completions/min_length": 640.0, "entropy/max": 0.3134765625, "entropy/mean": 0.224609375, "entropy/min": 0.14453125, "epoch": 0.4827586206896552, "frac_reward_zero_std": 0.0, "grad_norm": 0.13214048743247986, "learning_rate": 5.353777088091177e-07, "loss": -0.00835415255278349, "reward": 1.7976852655410767, "reward_std": 0.23752912878990173, "rewards/DiagnosisAccuracyORM/mean": 0.5222222208976746, "rewards/DiagnosisAccuracyORM/std": 0.3288768231868744, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27546295523643494, "rewards/KeyDiagnosticEvidenceORM/std": 0.11182297766208649, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/mean_length": 808.4166870117188, "completions/min_length": 632.0, "entropy/max": 0.3466796875, "entropy/mean": 0.220703125, "entropy/min": 0.1357421875, "epoch": 0.4847870182555781, "frac_reward_zero_std": 0.0, "grad_norm": 0.1355971395969391, "learning_rate": 5.321662203714908e-07, "loss": 0.012185007333755493, "reward": 1.7112269401550293, "reward_std": 0.1939890831708908, "rewards/DiagnosisAccuracyORM/mean": 0.40625, "rewards/DiagnosisAccuracyORM/std": 0.24251501262187958, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30497685074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.1274213343858719, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/mean_length": 815.4375, "completions/min_length": 639.0, "entropy/max": 0.326171875, "entropy/mean": 0.23583984375, "entropy/min": 0.16552734375, "epoch": 0.486815415821501, "frac_reward_zero_std": 0.0, "grad_norm": 0.14027869701385498, "learning_rate": 5.289533988464307e-07, "loss": -0.008561035618185997, "reward": 1.7124090194702148, "reward_std": 0.18831580877304077, "rewards/DiagnosisAccuracyORM/mean": 0.4224785268306732, "rewards/DiagnosisAccuracyORM/std": 0.25018182396888733, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2899305820465088, "rewards/KeyDiagnosticEvidenceORM/std": 0.13846345245838165, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/mean_length": 795.1875, "completions/min_length": 670.0, "entropy/max": 0.2890625, "entropy/mean": 0.20751953125, "entropy/min": 0.1611328125, "epoch": 0.48884381338742394, "frac_reward_zero_std": 0.0, "grad_norm": 0.10517430305480957, "learning_rate": 5.257393773851733e-07, "loss": -0.01672135479748249, "reward": 1.6142361164093018, "reward_std": 0.11844108998775482, "rewards/DiagnosisAccuracyORM/mean": 0.36944445967674255, "rewards/DiagnosisAccuracyORM/std": 0.3019026517868042, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2447916716337204, "rewards/KeyDiagnosticEvidenceORM/std": 0.10851824283599854, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/mean_length": 818.5625, "completions/min_length": 702.0, "entropy/max": 0.310546875, "entropy/mean": 0.19140625, "entropy/min": 0.11328125, "epoch": 0.4908722109533469, "frac_reward_zero_std": 0.0, "grad_norm": 0.11923424154520035, "learning_rate": 5.225242891886844e-07, "loss": 0.016617614775896072, "reward": 1.8232638835906982, "reward_std": 0.12711426615715027, "rewards/DiagnosisAccuracyORM/mean": 0.48819446563720703, "rewards/DiagnosisAccuracyORM/std": 0.3400903642177582, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3350694179534912, "rewards/KeyDiagnosticEvidenceORM/std": 0.1644260585308075, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/mean_length": 805.5625, "completions/min_length": 597.0, "entropy/max": 0.2841796875, "entropy/mean": 0.21240234375, "entropy/min": 0.12353515625, "epoch": 0.49290060851926976, "frac_reward_zero_std": 0.0, "grad_norm": 0.12442566454410553, "learning_rate": 5.193082675021392e-07, "loss": -0.009736260399222374, "reward": 1.8031251430511475, "reward_std": 0.18705594539642334, "rewards/DiagnosisAccuracyORM/mean": 0.5392361283302307, "rewards/DiagnosisAccuracyORM/std": 0.24717050790786743, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2638888955116272, "rewards/KeyDiagnosticEvidenceORM/std": 0.08479800075292587, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/mean_length": 834.1041870117188, "completions/min_length": 713.0, "entropy/max": 0.3359375, "entropy/mean": 0.232421875, "entropy/min": 0.1650390625, "epoch": 0.4949290060851927, "frac_reward_zero_std": 0.0, "grad_norm": 0.13844868540763855, "learning_rate": 5.160914456094004e-07, "loss": 0.004053813870996237, "reward": 1.7599289417266846, "reward_std": 0.18608643114566803, "rewards/DiagnosisAccuracyORM/mean": 0.4216766059398651, "rewards/DiagnosisAccuracyORM/std": 0.31881433725357056, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33825230598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.1414216309785843, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/mean_length": 833.6875, "completions/min_length": 693.0, "entropy/max": 0.365234375, "entropy/mean": 0.21923828125, "entropy/min": 0.1337890625, "epoch": 0.4969574036511156, "frac_reward_zero_std": 0.0, "grad_norm": 0.14686664938926697, "learning_rate": 5.128739568274943e-07, "loss": -0.011822971515357494, "reward": 1.7312666177749634, "reward_std": 0.2069929540157318, "rewards/DiagnosisAccuracyORM/mean": 0.43149805068969727, "rewards/DiagnosisAccuracyORM/std": 0.3122752904891968, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29976850748062134, "rewards/KeyDiagnosticEvidenceORM/std": 0.14720474183559418, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/mean_length": 822.9583740234375, "completions/min_length": 653.0, "entropy/max": 0.318359375, "entropy/mean": 0.22607421875, "entropy/min": 0.1513671875, "epoch": 0.49898580121703856, "frac_reward_zero_std": 0.0, "grad_norm": 0.11073679476976395, "learning_rate": 5.096559345010849e-07, "loss": -0.013353481888771057, "reward": 1.84675931930542, "reward_std": 0.2411099076271057, "rewards/DiagnosisAccuracyORM/mean": 0.5666666626930237, "rewards/DiagnosisAccuracyORM/std": 0.359964519739151, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2800925672054291, "rewards/KeyDiagnosticEvidenceORM/std": 0.11529266834259033, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/mean_length": 814.0, "completions/min_length": 641.0, "entropy/max": 0.3271484375, "entropy/mean": 0.21484375, "entropy/min": 0.1259765625, "epoch": 0.5010141987829615, "frac_reward_zero_std": 0.0, "grad_norm": 0.12771286070346832, "learning_rate": 5.06437511996949e-07, "loss": 0.011068063788115978, "reward": 1.9831019639968872, "reward_std": 0.1710605025291443, "rewards/DiagnosisAccuracyORM/mean": 0.6121527552604675, "rewards/DiagnosisAccuracyORM/std": 0.1795956790447235, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.37094905972480774, "rewards/KeyDiagnosticEvidenceORM/std": 0.14088280498981476, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/mean_length": 803.0, "completions/min_length": 682.0, "entropy/max": 0.40625, "entropy/mean": 0.21875, "entropy/min": 0.1572265625, "epoch": 0.5030425963488844, "frac_reward_zero_std": 0.0, "grad_norm": 0.12768423557281494, "learning_rate": 5.032188226984479e-07, "loss": -0.007847702130675316, "reward": 1.7670469284057617, "reward_std": 0.26146644353866577, "rewards/DiagnosisAccuracyORM/mean": 0.44181546568870544, "rewards/DiagnosisAccuracyORM/std": 0.3051162660121918, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3252314627170563, "rewards/KeyDiagnosticEvidenceORM/std": 0.13203643262386322, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/mean_length": 807.3958740234375, "completions/min_length": 662.0, "entropy/max": 0.4033203125, "entropy/mean": 0.20703125, "entropy/min": 0.11962890625, "epoch": 0.5050709939148073, "frac_reward_zero_std": 0.0, "grad_norm": 0.1107858344912529, "learning_rate": 5e-07, "loss": -0.009475549682974815, "reward": 1.501322865486145, "reward_std": 0.1313786506652832, "rewards/DiagnosisAccuracyORM/mean": 0.2750495970249176, "rewards/DiagnosisAccuracyORM/std": 0.20254330337047577, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.22627313435077667, "rewards/KeyDiagnosticEvidenceORM/std": 0.13764090836048126, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/mean_length": 832.7083740234375, "completions/min_length": 697.0, "entropy/max": 0.46484375, "entropy/mean": 0.23486328125, "entropy/min": 0.16943359375, "epoch": 0.5070993914807302, "frac_reward_zero_std": 0.0, "grad_norm": 0.14636261761188507, "learning_rate": 4.967811773015522e-07, "loss": 0.013171231374144554, "reward": 1.7853009700775146, "reward_std": 0.17803552746772766, "rewards/DiagnosisAccuracyORM/mean": 0.5277777910232544, "rewards/DiagnosisAccuracyORM/std": 0.33367595076560974, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25752314925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.12124848365783691, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/mean_length": 805.8125, "completions/min_length": 603.0, "entropy/max": 0.4091796875, "entropy/mean": 0.216796875, "entropy/min": 0.15185546875, "epoch": 0.5091277890466531, "frac_reward_zero_std": 0.0, "grad_norm": 0.14045734703540802, "learning_rate": 4.93562488003051e-07, "loss": -0.007931552827358246, "reward": 1.826157569885254, "reward_std": 0.258148729801178, "rewards/DiagnosisAccuracyORM/mean": 0.519444465637207, "rewards/DiagnosisAccuracyORM/std": 0.28437092900276184, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3067129850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.13929706811904907, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/mean_length": 815.6458740234375, "completions/min_length": 692.0, "entropy/max": 0.283203125, "entropy/mean": 0.197265625, "entropy/min": 0.14599609375, "epoch": 0.5111561866125761, "frac_reward_zero_std": 0.0, "grad_norm": 0.11359331011772156, "learning_rate": 4.90344065498915e-07, "loss": -0.011832071468234062, "reward": 1.7668981552124023, "reward_std": 0.21100163459777832, "rewards/DiagnosisAccuracyORM/mean": 0.3843750059604645, "rewards/DiagnosisAccuracyORM/std": 0.24140001833438873, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.38252314925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.15471208095550537, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/mean_length": 834.5, "completions/min_length": 682.0, "entropy/max": 0.41796875, "entropy/mean": 0.220703125, "entropy/min": 0.150390625, "epoch": 0.513184584178499, "frac_reward_zero_std": 0.0, "grad_norm": 0.13250596821308136, "learning_rate": 4.871260431725058e-07, "loss": 0.0023184046149253845, "reward": 1.7952795028686523, "reward_std": 0.17332103848457336, "rewards/DiagnosisAccuracyORM/mean": 0.5007192492485046, "rewards/DiagnosisAccuracyORM/std": 0.3142906725406647, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29456016421318054, "rewards/KeyDiagnosticEvidenceORM/std": 0.11195594817399979, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/mean_length": 822.8541870117188, "completions/min_length": 654.0, "entropy/max": 0.353515625, "entropy/mean": 0.2236328125, "entropy/min": 0.14794921875, "epoch": 0.5152129817444219, "frac_reward_zero_std": 0.0, "grad_norm": 0.1388242542743683, "learning_rate": 4.839085543905995e-07, "loss": -0.018161188811063766, "reward": 1.7572420835494995, "reward_std": 0.19117185473442078, "rewards/DiagnosisAccuracyORM/mean": 0.4482142925262451, "rewards/DiagnosisAccuracyORM/std": 0.3553553819656372, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.309027761220932, "rewards/KeyDiagnosticEvidenceORM/std": 0.11075964570045471, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/mean_length": 815.2291870117188, "completions/min_length": 622.0, "entropy/max": 0.427734375, "entropy/mean": 0.24072265625, "entropy/min": 0.15185546875, "epoch": 0.5172413793103449, "frac_reward_zero_std": 0.0, "grad_norm": 0.15044096112251282, "learning_rate": 4.806917324978607e-07, "loss": -0.00707803200930357, "reward": 1.715120792388916, "reward_std": 0.26487791538238525, "rewards/DiagnosisAccuracyORM/mean": 0.4211392402648926, "rewards/DiagnosisAccuracyORM/std": 0.25310665369033813, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2939814627170563, "rewards/KeyDiagnosticEvidenceORM/std": 0.14138807356357574, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/mean_length": 805.2083740234375, "completions/min_length": 654.0, "entropy/max": 0.2578125, "entropy/mean": 0.1904296875, "entropy/min": 0.13330078125, "epoch": 0.5192697768762677, "frac_reward_zero_std": 0.0, "grad_norm": 0.1140906810760498, "learning_rate": 4.774757108113155e-07, "loss": 0.0006611148710362613, "reward": 1.6974537372589111, "reward_std": 0.1709212064743042, "rewards/DiagnosisAccuracyORM/mean": 0.3756944239139557, "rewards/DiagnosisAccuracyORM/std": 0.2836972773075104, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32175925374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.1043815091252327, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/mean_length": 811.3958740234375, "completions/min_length": 662.0, "entropy/max": 0.3203125, "entropy/mean": 0.2041015625, "entropy/min": 0.15576171875, "epoch": 0.5212981744421906, "frac_reward_zero_std": 0.0, "grad_norm": 0.1265082210302353, "learning_rate": 4.742606226148267e-07, "loss": -0.016248105093836784, "reward": 1.852099895477295, "reward_std": 0.29790717363357544, "rewards/DiagnosisAccuracyORM/mean": 0.510664701461792, "rewards/DiagnosisAccuracyORM/std": 0.3154965043067932, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34143519401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.13523058593273163, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/mean_length": 806.9375, "completions/min_length": 671.0, "entropy/max": 0.25390625, "entropy/mean": 0.1904296875, "entropy/min": 0.12939453125, "epoch": 0.5233265720081136, "frac_reward_zero_std": 0.0, "grad_norm": 0.10514168441295624, "learning_rate": 4.710466011535694e-07, "loss": -0.002885202644392848, "reward": 1.6628308296203613, "reward_std": 0.15254420042037964, "rewards/DiagnosisAccuracyORM/mean": 0.4070436656475067, "rewards/DiagnosisAccuracyORM/std": 0.26626285910606384, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25578704476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.12501093745231628, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/mean_length": 813.0416870117188, "completions/min_length": 634.0, "entropy/max": 0.28466796875, "entropy/mean": 0.1982421875, "entropy/min": 0.12255859375, "epoch": 0.5253549695740365, "frac_reward_zero_std": 0.0, "grad_norm": 0.11158531159162521, "learning_rate": 4.6783377962850917e-07, "loss": 0.010287180542945862, "reward": 1.8068121671676636, "reward_std": 0.21464860439300537, "rewards/DiagnosisAccuracyORM/mean": 0.5012566447257996, "rewards/DiagnosisAccuracyORM/std": 0.3301968574523926, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3055555522441864, "rewards/KeyDiagnosticEvidenceORM/std": 0.12902311980724335, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/mean_length": 812.125, "completions/min_length": 699.0, "entropy/max": 0.4599609375, "entropy/mean": 0.22998046875, "entropy/min": 0.1611328125, "epoch": 0.5273833671399595, "frac_reward_zero_std": 0.0, "grad_norm": 0.14489667117595673, "learning_rate": 4.6462229119088225e-07, "loss": 0.010617135092616081, "reward": 1.6290509700775146, "reward_std": 0.19265002012252808, "rewards/DiagnosisAccuracyORM/mean": 0.3072916567325592, "rewards/DiagnosisAccuracyORM/std": 0.2838118374347687, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32175925374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.14045609533786774, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/mean_length": 826.5208740234375, "completions/min_length": 728.0, "entropy/max": 0.4189453125, "entropy/mean": 0.22607421875, "entropy/min": 0.1435546875, "epoch": 0.5294117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 0.13446229696273804, "learning_rate": 4.614122689366768e-07, "loss": -0.006802251096814871, "reward": 1.8215278387069702, "reward_std": 0.27654460072517395, "rewards/DiagnosisAccuracyORM/mean": 0.5385417342185974, "rewards/DiagnosisAccuracyORM/std": 0.3033141493797302, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2829861342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.13299012184143066, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/mean_length": 806.6041870117188, "completions/min_length": 691.0, "entropy/max": 0.283203125, "entropy/mean": 0.19970703125, "entropy/min": 0.13720703125, "epoch": 0.5314401622718052, "frac_reward_zero_std": 0.0, "grad_norm": 0.12408187985420227, "learning_rate": 4.5820384590111646e-07, "loss": 0.003564313054084778, "reward": 1.7697420120239258, "reward_std": 0.16368672251701355, "rewards/DiagnosisAccuracyORM/mean": 0.5145337581634521, "rewards/DiagnosisAccuracyORM/std": 0.24374167621135712, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2552083432674408, "rewards/KeyDiagnosticEvidenceORM/std": 0.12218141555786133, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/mean_length": 814.3125, "completions/min_length": 652.0, "entropy/max": 0.2802734375, "entropy/mean": 0.1953125, "entropy/min": 0.140625, "epoch": 0.5334685598377282, "frac_reward_zero_std": 0.0, "grad_norm": 0.12334045022726059, "learning_rate": 4.549971550531474e-07, "loss": -0.0020438856445252895, "reward": 1.836689829826355, "reward_std": 0.31288886070251465, "rewards/DiagnosisAccuracyORM/mean": 0.5045139193534851, "rewards/DiagnosisAccuracyORM/std": 0.30756306648254395, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33217594027519226, "rewards/KeyDiagnosticEvidenceORM/std": 0.12087131291627884, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/mean_length": 832.7291870117188, "completions/min_length": 687.0, "entropy/max": 0.322265625, "entropy/mean": 0.22509765625, "entropy/min": 0.146484375, "epoch": 0.5354969574036511, "frac_reward_zero_std": 0.0, "grad_norm": 0.13961857557296753, "learning_rate": 4.517923292899279e-07, "loss": 0.012723698280751705, "reward": 1.9398975372314453, "reward_std": 0.2233695387840271, "rewards/DiagnosisAccuracyORM/mean": 0.6158234477043152, "rewards/DiagnosisAccuracyORM/std": 0.25210240483283997, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3240740895271301, "rewards/KeyDiagnosticEvidenceORM/std": 0.1588321477174759, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/mean_length": 816.1041870117188, "completions/min_length": 607.0, "entropy/max": 0.376953125, "entropy/mean": 0.2421875, "entropy/min": 0.1513671875, "epoch": 0.537525354969574, "frac_reward_zero_std": 0.0, "grad_norm": 0.1474367380142212, "learning_rate": 4.485895014313197e-07, "loss": 0.013957245275378227, "reward": 1.7841931581497192, "reward_std": 0.23679807782173157, "rewards/DiagnosisAccuracyORM/mean": 0.5052579045295715, "rewards/DiagnosisAccuracyORM/std": 0.28024524450302124, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27893519401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.1411992758512497, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/mean_length": 807.2916870117188, "completions/min_length": 690.0, "entropy/max": 0.2822265625, "entropy/mean": 0.19921875, "entropy/min": 0.15625, "epoch": 0.539553752535497, "frac_reward_zero_std": 0.0, "grad_norm": 0.13020388782024384, "learning_rate": 4.453888042143846e-07, "loss": 0.0015042697777971625, "reward": 1.9242229461669922, "reward_std": 0.2137412130832672, "rewards/DiagnosisAccuracyORM/mean": 0.5706349015235901, "rewards/DiagnosisAccuracyORM/std": 0.3212592303752899, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.35358795523643494, "rewards/KeyDiagnosticEvidenceORM/std": 0.10792408883571625, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/mean_length": 816.25, "completions/min_length": 663.0, "entropy/max": 0.255859375, "entropy/mean": 0.18798828125, "entropy/min": 0.12548828125, "epoch": 0.5415821501014199, "frac_reward_zero_std": 0.0, "grad_norm": 0.12353622913360596, "learning_rate": 4.4219037028788213e-07, "loss": -0.009854106232523918, "reward": 1.8939815759658813, "reward_std": 0.23670683801174164, "rewards/DiagnosisAccuracyORM/mean": 0.5791667103767395, "rewards/DiagnosisAccuracyORM/std": 0.30454593896865845, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31481480598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.11635573208332062, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/mean_length": 816.75, "completions/min_length": 683.0, "entropy/max": 0.4287109375, "entropy/mean": 0.212890625, "entropy/min": 0.124267578125, "epoch": 0.5436105476673428, "frac_reward_zero_std": 0.0, "grad_norm": 0.14320628345012665, "learning_rate": 4.389943322067728e-07, "loss": 0.0022720396518707275, "reward": 1.6789352893829346, "reward_std": 0.2505607008934021, "rewards/DiagnosisAccuracyORM/mean": 0.3722222149372101, "rewards/DiagnosisAccuracyORM/std": 0.3235320448875427, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30671295523643494, "rewards/KeyDiagnosticEvidenceORM/std": 0.09891024976968765, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1044.0, "completions/mean_length": 842.7291870117188, "completions/min_length": 686.0, "entropy/max": 0.3330078125, "entropy/mean": 0.23046875, "entropy/min": 0.1328125, "epoch": 0.5456389452332657, "frac_reward_zero_std": 0.0, "grad_norm": 0.12971369922161102, "learning_rate": 4.3580082242672444e-07, "loss": -0.001034493325278163, "reward": 1.9353010654449463, "reward_std": 0.25532543659210205, "rewards/DiagnosisAccuracyORM/mean": 0.6170138716697693, "rewards/DiagnosisAccuracyORM/std": 0.31376028060913086, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31828704476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.12838001549243927, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/mean_length": 814.25, "completions/min_length": 632.0, "entropy/max": 0.400390625, "entropy/mean": 0.2109375, "entropy/min": 0.13330078125, "epoch": 0.5476673427991886, "frac_reward_zero_std": 0.0, "grad_norm": 0.12681198120117188, "learning_rate": 4.3260997329862307e-07, "loss": 0.008635997772216797, "reward": 1.926041841506958, "reward_std": 0.2615225911140442, "rewards/DiagnosisAccuracyORM/mean": 0.5336806178092957, "rewards/DiagnosisAccuracyORM/std": 0.34261006116867065, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3923611342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.1680034101009369, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/mean_length": 815.8125, "completions/min_length": 614.0, "entropy/max": 0.3173828125, "entropy/mean": 0.20263671875, "entropy/min": 0.1318359375, "epoch": 0.5496957403651116, "frac_reward_zero_std": 0.0, "grad_norm": 0.2014678716659546, "learning_rate": 4.29421917063087e-07, "loss": 0.0029678146820515394, "reward": 1.8104910850524902, "reward_std": 0.1830994039773941, "rewards/DiagnosisAccuracyORM/mean": 0.49625495076179504, "rewards/DiagnosisAccuracyORM/std": 0.31446900963783264, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3142361342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.13168758153915405, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/mean_length": 819.8541870117188, "completions/min_length": 685.0, "entropy/max": 0.30859375, "entropy/mean": 0.20263671875, "entropy/min": 0.1416015625, "epoch": 0.5517241379310345, "frac_reward_zero_std": 0.0, "grad_norm": 0.11601993441581726, "learning_rate": 4.2623678584498664e-07, "loss": -0.013217270374298096, "reward": 1.8641037940979004, "reward_std": 0.20894208550453186, "rewards/DiagnosisAccuracyORM/mean": 0.562599241733551, "rewards/DiagnosisAccuracyORM/std": 0.31526586413383484, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30150464177131653, "rewards/KeyDiagnosticEvidenceORM/std": 0.11502983421087265, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/mean_length": 795.4166870117188, "completions/min_length": 647.0, "entropy/max": 0.2646484375, "entropy/mean": 0.19677734375, "entropy/min": 0.124267578125, "epoch": 0.5537525354969574, "frac_reward_zero_std": 0.0, "grad_norm": 0.12758012115955353, "learning_rate": 4.2305471164796904e-07, "loss": -0.007597201969474554, "reward": 1.5870370864868164, "reward_std": 0.20956185460090637, "rewards/DiagnosisAccuracyORM/mean": 0.34687504172325134, "rewards/DiagnosisAccuracyORM/std": 0.2543627917766571, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24016205966472626, "rewards/KeyDiagnosticEvidenceORM/std": 0.13817660510540009, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/mean_length": 806.8541870117188, "completions/min_length": 671.0, "entropy/max": 0.2763671875, "entropy/mean": 0.20556640625, "entropy/min": 0.146484375, "epoch": 0.5557809330628803, "frac_reward_zero_std": 0.0, "grad_norm": 0.1253810077905655, "learning_rate": 4.1987582634898723e-07, "loss": 0.004457567818462849, "reward": 1.7879794836044312, "reward_std": 0.2357156127691269, "rewards/DiagnosisAccuracyORM/mean": 0.4887896776199341, "rewards/DiagnosisAccuracyORM/std": 0.27720907330513, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29918980598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.10475928336381912, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/mean_length": 810.2916870117188, "completions/min_length": 674.0, "entropy/max": 0.283203125, "entropy/mean": 0.18701171875, "entropy/min": 0.13671875, "epoch": 0.5578093306288032, "frac_reward_zero_std": 0.0, "grad_norm": 0.1371307671070099, "learning_rate": 4.16700261692834e-07, "loss": -0.004473940934985876, "reward": 1.6885087490081787, "reward_std": 0.2611103355884552, "rewards/DiagnosisAccuracyORM/mean": 0.40089285373687744, "rewards/DiagnosisAccuracyORM/std": 0.3048844337463379, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28761574625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.1269049197435379, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/mean_length": 805.3125, "completions/min_length": 655.0, "entropy/max": 0.2646484375, "entropy/mean": 0.19775390625, "entropy/min": 0.13720703125, "epoch": 0.5598377281947262, "frac_reward_zero_std": 0.0, "grad_norm": 0.12494851648807526, "learning_rate": 4.1352814928668257e-07, "loss": -0.01922827959060669, "reward": 1.7436342239379883, "reward_std": 0.23200295865535736, "rewards/DiagnosisAccuracyORM/mean": 0.4114583432674408, "rewards/DiagnosisAccuracyORM/std": 0.28087493777275085, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33217594027519226, "rewards/KeyDiagnosticEvidenceORM/std": 0.10534044355154037, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1051.0, "completions/mean_length": 811.0, "completions/min_length": 631.0, "entropy/max": 0.2880859375, "entropy/mean": 0.19287109375, "entropy/min": 0.130615234375, "epoch": 0.5618661257606491, "frac_reward_zero_std": 0.0, "grad_norm": 0.11801911890506744, "learning_rate": 4.1035962059463224e-07, "loss": -0.002080662874504924, "reward": 1.6683533191680908, "reward_std": 0.2583598494529724, "rewards/DiagnosisAccuracyORM/mean": 0.40446433424949646, "rewards/DiagnosisAccuracyORM/std": 0.27345651388168335, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2638888955116272, "rewards/KeyDiagnosticEvidenceORM/std": 0.12262988090515137, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/mean_length": 798.5625, "completions/min_length": 625.0, "entropy/max": 0.26171875, "entropy/mean": 0.19677734375, "entropy/min": 0.138671875, "epoch": 0.563894523326572, "frac_reward_zero_std": 0.0, "grad_norm": 0.11504043638706207, "learning_rate": 4.071948069322596e-07, "loss": 0.005844511091709137, "reward": 1.8547950983047485, "reward_std": 0.2077069729566574, "rewards/DiagnosisAccuracyORM/mean": 0.4983135163784027, "rewards/DiagnosisAccuracyORM/std": 0.3492884933948517, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3564814627170563, "rewards/KeyDiagnosticEvidenceORM/std": 0.12051425129175186, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/mean_length": 796.0208740234375, "completions/min_length": 700.0, "entropy/max": 0.30078125, "entropy/mean": 0.181640625, "entropy/min": 0.12939453125, "epoch": 0.565922920892495, "frac_reward_zero_std": 0.0, "grad_norm": 0.12033189833164215, "learning_rate": 4.0403383946117715e-07, "loss": 0.0020854524336755276, "reward": 1.7295138835906982, "reward_std": 0.19134342670440674, "rewards/DiagnosisAccuracyORM/mean": 0.40486112236976624, "rewards/DiagnosisAccuracyORM/std": 0.24884982407093048, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3246527910232544, "rewards/KeyDiagnosticEvidenceORM/std": 0.08424564450979233, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/mean_length": 791.1666870117188, "completions/min_length": 678.0, "entropy/max": 0.28759765625, "entropy/mean": 0.18896484375, "entropy/min": 0.12646484375, "epoch": 0.5679513184584178, "frac_reward_zero_std": 0.0, "grad_norm": 0.09916926175355911, "learning_rate": 4.008768491835964e-07, "loss": 0.005903822835534811, "reward": 1.8995784521102905, "reward_std": 0.19729483127593994, "rewards/DiagnosisAccuracyORM/mean": 0.5454117059707642, "rewards/DiagnosisAccuracyORM/std": 0.28046637773513794, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3541666567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.10999736934900284, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/mean_length": 809.75, "completions/min_length": 634.0, "entropy/max": 0.44921875, "entropy/mean": 0.22802734375, "entropy/min": 0.15234375, "epoch": 0.5699797160243407, "frac_reward_zero_std": 0.0, "grad_norm": 0.13756728172302246, "learning_rate": 3.977239669368997e-07, "loss": -0.009733237326145172, "reward": 1.7291004657745361, "reward_std": 0.26518601179122925, "rewards/DiagnosisAccuracyORM/mean": 0.36393848061561584, "rewards/DiagnosisAccuracyORM/std": 0.3307199776172638, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.36516204476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.21447713673114777, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/mean_length": 817.3541870117188, "completions/min_length": 659.0, "entropy/max": 0.3681640625, "entropy/mean": 0.2060546875, "entropy/min": 0.130859375, "epoch": 0.5720081135902637, "frac_reward_zero_std": 0.0, "grad_norm": 0.12078668922185898, "learning_rate": 3.9457532338821675e-07, "loss": -0.001948972581885755, "reward": 1.7555556297302246, "reward_std": 0.19148264825344086, "rewards/DiagnosisAccuracyORM/mean": 0.39618054032325745, "rewards/DiagnosisAccuracyORM/std": 0.27631568908691406, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.359375, "rewards/KeyDiagnosticEvidenceORM/std": 0.1280532330274582, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/mean_length": 826.0833740234375, "completions/min_length": 643.0, "entropy/max": 0.302734375, "entropy/mean": 0.203125, "entropy/min": 0.126220703125, "epoch": 0.5740365111561866, "frac_reward_zero_std": 0.0, "grad_norm": 0.12106510251760483, "learning_rate": 3.914310490290108e-07, "loss": 0.007928004488348961, "reward": 1.921875, "reward_std": 0.29876530170440674, "rewards/DiagnosisAccuracyORM/mean": 0.616319477558136, "rewards/DiagnosisAccuracyORM/std": 0.34517085552215576, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3055555522441864, "rewards/KeyDiagnosticEvidenceORM/std": 0.1301632523536682, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/mean_length": 798.3541870117188, "completions/min_length": 664.0, "entropy/max": 0.26318359375, "entropy/mean": 0.19189453125, "entropy/min": 0.14794921875, "epoch": 0.5760649087221096, "frac_reward_zero_std": 0.0, "grad_norm": 0.13656915724277496, "learning_rate": 3.882912741696688e-07, "loss": -0.027518410235643387, "reward": 1.9137731790542603, "reward_std": 0.28302934765815735, "rewards/DiagnosisAccuracyORM/mean": 0.5642361044883728, "rewards/DiagnosisAccuracyORM/std": 0.32224446535110474, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34953704476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.1430044323205948, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/mean_length": 822.0416870117188, "completions/min_length": 671.0, "entropy/max": 0.45703125, "entropy/mean": 0.25048828125, "entropy/min": 0.14208984375, "epoch": 0.5780933062880325, "frac_reward_zero_std": 0.0, "grad_norm": 0.13617002964019775, "learning_rate": 3.8515612893410224e-07, "loss": -0.0005208788206800818, "reward": 1.6254795789718628, "reward_std": 0.17344996333122253, "rewards/DiagnosisAccuracyORM/mean": 0.34191468358039856, "rewards/DiagnosisAccuracyORM/std": 0.27112966775894165, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28356480598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.15105773508548737, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/mean_length": 791.2708740234375, "completions/min_length": 671.0, "entropy/max": 0.3388671875, "entropy/mean": 0.205078125, "entropy/min": 0.120849609375, "epoch": 0.5801217038539553, "frac_reward_zero_std": 0.0, "grad_norm": 0.1250656694173813, "learning_rate": 3.8202574325435383e-07, "loss": -0.0003554274735506624, "reward": 1.5887566804885864, "reward_std": 0.1469736397266388, "rewards/DiagnosisAccuracyORM/mean": 0.3844741880893707, "rewards/DiagnosisAccuracyORM/std": 0.2741755545139313, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.20428240299224854, "rewards/KeyDiagnosticEvidenceORM/std": 0.11118342727422714, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/mean_length": 831.875, "completions/min_length": 666.0, "entropy/max": 0.3505859375, "entropy/mean": 0.2353515625, "entropy/min": 0.16943359375, "epoch": 0.5821501014198783, "frac_reward_zero_std": 0.0, "grad_norm": 0.11452984809875488, "learning_rate": 3.7890024686521205e-07, "loss": 0.011754287406802177, "reward": 1.637351155281067, "reward_std": 0.25089502334594727, "rewards/DiagnosisAccuracyORM/mean": 0.39429566264152527, "rewards/DiagnosisAccuracyORM/std": 0.3270454704761505, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2430555671453476, "rewards/KeyDiagnosticEvidenceORM/std": 0.12946771085262299, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/mean_length": 815.625, "completions/min_length": 682.0, "entropy/max": 0.3115234375, "entropy/mean": 0.2109375, "entropy/min": 0.14990234375, "epoch": 0.5841784989858012, "frac_reward_zero_std": 0.0, "grad_norm": 0.14345578849315643, "learning_rate": 3.7577976929883603e-07, "loss": -0.0002662887272890657, "reward": 1.8389880657196045, "reward_std": 0.19075247645378113, "rewards/DiagnosisAccuracyORM/mean": 0.5004464387893677, "rewards/DiagnosisAccuracyORM/std": 0.2779156267642975, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3385416567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.13495081663131714, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/mean_length": 809.375, "completions/min_length": 642.0, "entropy/max": 0.3193359375, "entropy/mean": 0.20068359375, "entropy/min": 0.13037109375, "epoch": 0.5862068965517241, "frac_reward_zero_std": 0.0, "grad_norm": 0.1024932712316513, "learning_rate": 3.7266443987938565e-07, "loss": -0.008148193359375, "reward": 1.714599847793579, "reward_std": 0.16603457927703857, "rewards/DiagnosisAccuracyORM/mean": 0.43392857909202576, "rewards/DiagnosisAccuracyORM/std": 0.27586251497268677, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28067126870155334, "rewards/KeyDiagnosticEvidenceORM/std": 0.13647302985191345, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/mean_length": 805.1041870117188, "completions/min_length": 705.0, "entropy/max": 0.27734375, "entropy/mean": 0.1865234375, "entropy/min": 0.13330078125, "epoch": 0.5882352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 0.1036115363240242, "learning_rate": 3.6955438771766254e-07, "loss": 0.0011088873725384474, "reward": 1.8971809148788452, "reward_std": 0.22369924187660217, "rewards/DiagnosisAccuracyORM/mean": 0.5198660492897034, "rewards/DiagnosisAccuracyORM/std": 0.33169296383857727, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.37731480598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.13276490569114685, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/mean_length": 819.7916870117188, "completions/min_length": 585.0, "entropy/max": 0.2998046875, "entropy/mean": 0.20068359375, "entropy/min": 0.1318359375, "epoch": 0.59026369168357, "frac_reward_zero_std": 0.0, "grad_norm": 0.11428386718034744, "learning_rate": 3.6644974170575904e-07, "loss": 0.024397214874625206, "reward": 1.8830937147140503, "reward_std": 0.1789446473121643, "rewards/DiagnosisAccuracyORM/mean": 0.5387648940086365, "rewards/DiagnosisAccuracyORM/std": 0.22368431091308594, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34432873129844666, "rewards/KeyDiagnosticEvidenceORM/std": 0.13459552824497223, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/mean_length": 822.5625, "completions/min_length": 644.0, "entropy/max": 0.3330078125, "entropy/mean": 0.19921875, "entropy/min": 0.1416015625, "epoch": 0.592292089249493, "frac_reward_zero_std": 0.0, "grad_norm": 0.11351858824491501, "learning_rate": 3.633506305117172e-07, "loss": -0.0034329346381127834, "reward": 1.8245370388031006, "reward_std": 0.23952631652355194, "rewards/DiagnosisAccuracyORM/mean": 0.5374999642372131, "rewards/DiagnosisAccuracyORM/std": 0.32368117570877075, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28703704476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.09729959070682526, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/mean_length": 833.2291870117188, "completions/min_length": 669.0, "entropy/max": 0.28515625, "entropy/mean": 0.212890625, "entropy/min": 0.1630859375, "epoch": 0.5943204868154158, "frac_reward_zero_std": 0.0, "grad_norm": 0.1179644837975502, "learning_rate": 3.602571825741953e-07, "loss": -0.0009258538484573364, "reward": 2.0497519969940186, "reward_std": 0.1950395405292511, "rewards/DiagnosisAccuracyORM/mean": 0.7216269969940186, "rewards/DiagnosisAccuracyORM/std": 0.2444288432598114, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.328125, "rewards/KeyDiagnosticEvidenceORM/std": 0.10276839882135391, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/mean_length": 827.6666870117188, "completions/min_length": 686.0, "entropy/max": 0.3115234375, "entropy/mean": 0.20703125, "entropy/min": 0.126220703125, "epoch": 0.5963488843813387, "frac_reward_zero_std": 0.0, "grad_norm": 0.13400103151798248, "learning_rate": 3.5716952609714514e-07, "loss": -0.008936728350818157, "reward": 1.750520944595337, "reward_std": 0.27706700563430786, "rewards/DiagnosisAccuracyORM/mean": 0.4895254671573639, "rewards/DiagnosisAccuracyORM/std": 0.29079893231391907, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.26099535822868347, "rewards/KeyDiagnosticEvidenceORM/std": 0.08435923606157303, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/mean_length": 824.7916870117188, "completions/min_length": 651.0, "entropy/max": 0.455078125, "entropy/mean": 0.21875, "entropy/min": 0.1376953125, "epoch": 0.5983772819472617, "frac_reward_zero_std": 0.0, "grad_norm": 0.11515460908412933, "learning_rate": 3.5408778904449887e-07, "loss": 0.010316319763660431, "reward": 1.8320602178573608, "reward_std": 0.21995408833026886, "rewards/DiagnosisAccuracyORM/mean": 0.49583330750465393, "rewards/DiagnosisAccuracyORM/std": 0.3685397207736969, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33622685074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.14339493215084076, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/mean_length": 816.0, "completions/min_length": 635.0, "entropy/max": 0.392578125, "entropy/mean": 0.22607421875, "entropy/min": 0.13525390625, "epoch": 0.6004056795131846, "frac_reward_zero_std": 0.0, "grad_norm": 0.14219045639038086, "learning_rate": 3.510120991348665e-07, "loss": 0.002890348434448242, "reward": 1.7322916984558105, "reward_std": 0.2482488751411438, "rewards/DiagnosisAccuracyORM/mean": 0.43020835518836975, "rewards/DiagnosisAccuracyORM/std": 0.26396360993385315, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3020833432674408, "rewards/KeyDiagnosticEvidenceORM/std": 0.1331096738576889, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/mean_length": 789.625, "completions/min_length": 639.0, "entropy/max": 0.28125, "entropy/mean": 0.19189453125, "entropy/min": 0.12744140625, "epoch": 0.6024340770791075, "frac_reward_zero_std": 0.0, "grad_norm": 0.12068392336368561, "learning_rate": 3.479425838362411e-07, "loss": 0.006175441201776266, "reward": 1.879778504371643, "reward_std": 0.1713634431362152, "rewards/DiagnosisAccuracyORM/mean": 0.5886905193328857, "rewards/DiagnosisAccuracyORM/std": 0.2927776277065277, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2910879850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.13859182596206665, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/mean_length": 802.2708740234375, "completions/min_length": 631.0, "entropy/max": 0.314453125, "entropy/mean": 0.2041015625, "entropy/min": 0.130126953125, "epoch": 0.6044624746450304, "frac_reward_zero_std": 0.0, "grad_norm": 0.12618593871593475, "learning_rate": 3.4487937036071744e-07, "loss": 0.02458568662405014, "reward": 1.9480323791503906, "reward_std": 0.24827712774276733, "rewards/DiagnosisAccuracyORM/mean": 0.6083333492279053, "rewards/DiagnosisAccuracyORM/std": 0.3033841848373413, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3396991193294525, "rewards/KeyDiagnosticEvidenceORM/std": 0.1588224619626999, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/mean_length": 830.5625, "completions/min_length": 636.0, "entropy/max": 0.3544921875, "entropy/mean": 0.23046875, "entropy/min": 0.14794921875, "epoch": 0.6064908722109533, "frac_reward_zero_std": 0.0, "grad_norm": 0.15080368518829346, "learning_rate": 3.418225856592193e-07, "loss": -0.012451840564608574, "reward": 1.6567130088806152, "reward_std": 0.19103948771953583, "rewards/DiagnosisAccuracyORM/mean": 0.4124999940395355, "rewards/DiagnosisAccuracyORM/std": 0.2609878480434418, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24421297013759613, "rewards/KeyDiagnosticEvidenceORM/std": 0.10022930055856705, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/mean_length": 793.5625, "completions/min_length": 676.0, "entropy/max": 0.2705078125, "entropy/mean": 0.193359375, "entropy/min": 0.14501953125, "epoch": 0.6085192697768763, "frac_reward_zero_std": 0.0, "grad_norm": 0.12725411355495453, "learning_rate": 3.387723564162379e-07, "loss": 0.014845555648207664, "reward": 1.893981695175171, "reward_std": 0.24331572651863098, "rewards/DiagnosisAccuracyORM/mean": 0.5461805462837219, "rewards/DiagnosisAccuracyORM/std": 0.2861301004886627, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3478008806705475, "rewards/KeyDiagnosticEvidenceORM/std": 0.1203310489654541, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/mean_length": 825.6041870117188, "completions/min_length": 564.0, "entropy/max": 0.375, "entropy/mean": 0.2236328125, "entropy/min": 0.14599609375, "epoch": 0.6105476673427992, "frac_reward_zero_std": 0.0, "grad_norm": 0.10703917592763901, "learning_rate": 3.357288090445826e-07, "loss": -0.02109910547733307, "reward": 1.8188657760620117, "reward_std": 0.1635793149471283, "rewards/DiagnosisAccuracyORM/mean": 0.5295138955116272, "rewards/DiagnosisAccuracyORM/std": 0.3063018023967743, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28935185074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.11342643946409225, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/mean_length": 817.3541870117188, "completions/min_length": 654.0, "entropy/max": 0.26171875, "entropy/mean": 0.2119140625, "entropy/min": 0.14794921875, "epoch": 0.6125760649087221, "frac_reward_zero_std": 0.0, "grad_norm": 0.13774347305297852, "learning_rate": 3.3269206968014097e-07, "loss": -0.02487466298043728, "reward": 1.725760579109192, "reward_std": 0.22313351929187775, "rewards/DiagnosisAccuracyORM/mean": 0.42599204182624817, "rewards/DiagnosisAccuracyORM/std": 0.27951472997665405, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2997685372829437, "rewards/KeyDiagnosticEvidenceORM/std": 0.09221009910106659, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/mean_length": 813.1666870117188, "completions/min_length": 684.0, "entropy/max": 0.4345703125, "entropy/mean": 0.216796875, "entropy/min": 0.142578125, "epoch": 0.6146044624746451, "frac_reward_zero_std": 0.0, "grad_norm": 0.1276090443134308, "learning_rate": 3.296622641766512e-07, "loss": -0.003202587366104126, "reward": 1.65453040599823, "reward_std": 0.1652543991804123, "rewards/DiagnosisAccuracyORM/mean": 0.32698413729667664, "rewards/DiagnosisAccuracyORM/std": 0.32591813802719116, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32754629850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.1359570324420929, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/mean_length": 798.9583740234375, "completions/min_length": 676.0, "entropy/max": 0.2646484375, "entropy/mean": 0.19921875, "entropy/min": 0.14453125, "epoch": 0.6166328600405679, "frac_reward_zero_std": 0.0, "grad_norm": 0.11145465821027756, "learning_rate": 3.2663951810048676e-07, "loss": 0.002044588327407837, "reward": 1.7683863639831543, "reward_std": 0.23361317813396454, "rewards/DiagnosisAccuracyORM/mean": 0.4935019910335541, "rewards/DiagnosisAccuracyORM/std": 0.29632556438446045, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27488425374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.10980909317731857, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/mean_length": 805.8541870117188, "completions/min_length": 657.0, "entropy/max": 0.3095703125, "entropy/mean": 0.2109375, "entropy/min": 0.1494140625, "epoch": 0.6186612576064908, "frac_reward_zero_std": 0.0, "grad_norm": 0.13063707947731018, "learning_rate": 3.236239567254526e-07, "loss": -0.027865014970302582, "reward": 1.6984705924987793, "reward_std": 0.21669170260429382, "rewards/DiagnosisAccuracyORM/mean": 0.38018354773521423, "rewards/DiagnosisAccuracyORM/std": 0.2613776624202728, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31828704476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.10549617558717728, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/mean_length": 805.0208740234375, "completions/min_length": 676.0, "entropy/max": 0.2939453125, "entropy/mean": 0.19873046875, "entropy/min": 0.142578125, "epoch": 0.6206896551724138, "frac_reward_zero_std": 0.0, "grad_norm": 0.13094471395015717, "learning_rate": 3.206157050275927e-07, "loss": -0.024251852184534073, "reward": 1.8013889789581299, "reward_std": 0.27650636434555054, "rewards/DiagnosisAccuracyORM/mean": 0.48020830750465393, "rewards/DiagnosisAccuracyORM/std": 0.332119345664978, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3211805820465088, "rewards/KeyDiagnosticEvidenceORM/std": 0.11183367669582367, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/mean_length": 821.625, "completions/min_length": 576.0, "entropy/max": 0.28564453125, "entropy/mean": 0.20458984375, "entropy/min": 0.136962890625, "epoch": 0.6227180527383367, "frac_reward_zero_std": 0.0, "grad_norm": 0.13102981448173523, "learning_rate": 3.176148876800108e-07, "loss": 0.01701786182820797, "reward": 1.958217740058899, "reward_std": 0.2570303678512573, "rewards/DiagnosisAccuracyORM/mean": 0.5704861283302307, "rewards/DiagnosisAccuracyORM/std": 0.25095537304878235, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3877314627170563, "rewards/KeyDiagnosticEvidenceORM/std": 0.17304182052612305, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/mean_length": 818.0208740234375, "completions/min_length": 655.0, "entropy/max": 0.2626953125, "entropy/mean": 0.20166015625, "entropy/min": 0.13916015625, "epoch": 0.6247464503042597, "frac_reward_zero_std": 0.0, "grad_norm": 0.11774726212024689, "learning_rate": 3.146216290477037e-07, "loss": -0.019256845116615295, "reward": 1.6974455118179321, "reward_std": 0.19087855517864227, "rewards/DiagnosisAccuracyORM/mean": 0.4165426790714264, "rewards/DiagnosisAccuracyORM/std": 0.2665667235851288, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28090277314186096, "rewards/KeyDiagnosticEvidenceORM/std": 0.11281731724739075, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/mean_length": 827.6458740234375, "completions/min_length": 702.0, "entropy/max": 0.2724609375, "entropy/mean": 0.203125, "entropy/min": 0.12646484375, "epoch": 0.6267748478701826, "frac_reward_zero_std": 0.0, "grad_norm": 0.12023108452558517, "learning_rate": 3.1163605318240736e-07, "loss": 0.01437501609325409, "reward": 1.7758266925811768, "reward_std": 0.1869094967842102, "rewards/DiagnosisAccuracyORM/mean": 0.4384424686431885, "rewards/DiagnosisAccuracyORM/std": 0.30036965012550354, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33738425374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.1269695907831192, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/mean_length": 840.7083740234375, "completions/min_length": 659.0, "entropy/max": 0.318359375, "entropy/mean": 0.22265625, "entropy/min": 0.16015625, "epoch": 0.6288032454361054, "frac_reward_zero_std": 0.0, "grad_norm": 0.15167920291423798, "learning_rate": 3.086582838174551e-07, "loss": -0.005076955072581768, "reward": 1.8820602893829346, "reward_std": 0.17692601680755615, "rewards/DiagnosisAccuracyORM/mean": 0.5302083492279053, "rewards/DiagnosisAccuracyORM/std": 0.3769301176071167, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.35185185074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.11817573010921478, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/mean_length": 820.1666870117188, "completions/min_length": 662.0, "entropy/max": 0.267578125, "entropy/mean": 0.19482421875, "entropy/min": 0.12890625, "epoch": 0.6308316430020284, "frac_reward_zero_std": 0.0, "grad_norm": 0.12848792970180511, "learning_rate": 3.0568844436264985e-07, "loss": 0.011742839589715004, "reward": 1.944328784942627, "reward_std": 0.2882111072540283, "rewards/DiagnosisAccuracyORM/mean": 0.5982639193534851, "rewards/DiagnosisAccuracyORM/std": 0.3224140405654907, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34606480598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.11140008270740509, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/mean_length": 802.1458740234375, "completions/min_length": 673.0, "entropy/max": 0.2470703125, "entropy/mean": 0.1884765625, "entropy/min": 0.13720703125, "epoch": 0.6328600405679513, "frac_reward_zero_std": 0.0, "grad_norm": 0.11310470104217529, "learning_rate": 3.027266578991496e-07, "loss": 0.008660441264510155, "reward": 1.9945602416992188, "reward_std": 0.17911013960838318, "rewards/DiagnosisAccuracyORM/mean": 0.5888888835906982, "rewards/DiagnosisAccuracyORM/std": 0.22707758843898773, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.40567126870155334, "rewards/KeyDiagnosticEvidenceORM/std": 0.12962479889392853, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/mean_length": 823.4375, "completions/min_length": 657.0, "entropy/max": 0.2890625, "entropy/mean": 0.21484375, "entropy/min": 0.16015625, "epoch": 0.6348884381338742, "frac_reward_zero_std": 0.0, "grad_norm": 0.14143694937229156, "learning_rate": 2.997730471743667e-07, "loss": -0.005622841417789459, "reward": 1.983019232749939, "reward_std": 0.1776854395866394, "rewards/DiagnosisAccuracyORM/mean": 0.6508432030677795, "rewards/DiagnosisAccuracyORM/std": 0.318546861410141, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33217594027519226, "rewards/KeyDiagnosticEvidenceORM/std": 0.10780518501996994, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/mean_length": 811.5, "completions/min_length": 657.0, "entropy/max": 0.294921875, "entropy/mean": 0.21240234375, "entropy/min": 0.15576171875, "epoch": 0.6369168356997972, "frac_reward_zero_std": 0.0, "grad_norm": 0.12356024235486984, "learning_rate": 2.968277345968808e-07, "loss": 0.016775818541646004, "reward": 1.979968786239624, "reward_std": 0.2443697154521942, "rewards/DiagnosisAccuracyORM/mean": 0.5870288014411926, "rewards/DiagnosisAccuracyORM/std": 0.2201005518436432, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.39293980598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.20030754804611206, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/mean_length": 806.4791870117188, "completions/min_length": 683.0, "entropy/max": 0.23779296875, "entropy/mean": 0.18896484375, "entropy/min": 0.1435546875, "epoch": 0.6389452332657201, "frac_reward_zero_std": 0.0, "grad_norm": 0.12164712697267532, "learning_rate": 2.938908422313652e-07, "loss": -0.0016990478616207838, "reward": 1.9269676208496094, "reward_std": 0.1941017508506775, "rewards/DiagnosisAccuracyORM/mean": 0.5739583373069763, "rewards/DiagnosisAccuracyORM/std": 0.31734541058540344, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3530092239379883, "rewards/KeyDiagnosticEvidenceORM/std": 0.13535192608833313, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/mean_length": 816.4166870117188, "completions/min_length": 691.0, "entropy/max": 0.259765625, "entropy/mean": 0.1923828125, "entropy/min": 0.1435546875, "epoch": 0.640973630831643, "frac_reward_zero_std": 0.0, "grad_norm": 0.116903156042099, "learning_rate": 2.909624917935283e-07, "loss": 0.003070443868637085, "reward": 1.7460317611694336, "reward_std": 0.1991085261106491, "rewards/DiagnosisAccuracyORM/mean": 0.3866567611694336, "rewards/DiagnosisAccuracyORM/std": 0.2711532413959503, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.359375, "rewards/KeyDiagnosticEvidenceORM/std": 0.12414750456809998, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/mean_length": 835.75, "completions/min_length": 670.0, "entropy/max": 0.3447265625, "entropy/mean": 0.1953125, "entropy/min": 0.125244140625, "epoch": 0.6430020283975659, "frac_reward_zero_std": 0.0, "grad_norm": 0.11679522693157196, "learning_rate": 2.880428046450697e-07, "loss": 0.0012269641738384962, "reward": 1.5982639789581299, "reward_std": 0.21866819262504578, "rewards/DiagnosisAccuracyORM/mean": 0.2927083373069763, "rewards/DiagnosisAccuracyORM/std": 0.29087597131729126, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3055555820465088, "rewards/KeyDiagnosticEvidenceORM/std": 0.1572643369436264, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/mean_length": 832.5833740234375, "completions/min_length": 629.0, "entropy/max": 0.314453125, "entropy/mean": 0.20556640625, "entropy/min": 0.13330078125, "epoch": 0.6450304259634888, "frac_reward_zero_std": 0.0, "grad_norm": 0.12037487328052521, "learning_rate": 2.8513190178865e-07, "loss": -0.0015516355633735657, "reward": 1.7887732982635498, "reward_std": 0.19148367643356323, "rewards/DiagnosisAccuracyORM/mean": 0.4722222089767456, "rewards/DiagnosisAccuracyORM/std": 0.3003019392490387, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31655094027519226, "rewards/KeyDiagnosticEvidenceORM/std": 0.10700749605894089, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/mean_length": 800.8333740234375, "completions/min_length": 657.0, "entropy/max": 0.2548828125, "entropy/mean": 0.1845703125, "entropy/min": 0.146484375, "epoch": 0.6470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 0.11191432923078537, "learning_rate": 2.8222990386287614e-07, "loss": -0.015797540545463562, "reward": 1.6707425117492676, "reward_std": 0.19372594356536865, "rewards/DiagnosisAccuracyORM/mean": 0.36808037757873535, "rewards/DiagnosisAccuracyORM/std": 0.2665584087371826, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30266204476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.11563479900360107, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/mean_length": 798.8125, "completions/min_length": 664.0, "entropy/max": 0.2998046875, "entropy/mean": 0.19873046875, "entropy/min": 0.135498046875, "epoch": 0.6490872210953347, "frac_reward_zero_std": 0.0, "grad_norm": 0.1461854875087738, "learning_rate": 2.793369311373021e-07, "loss": -0.002720564603805542, "reward": 1.6611111164093018, "reward_std": 0.14389313757419586, "rewards/DiagnosisAccuracyORM/mean": 0.37291669845581055, "rewards/DiagnosisAccuracyORM/std": 0.22977593541145325, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2881944477558136, "rewards/KeyDiagnosticEvidenceORM/std": 0.16761207580566406, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/mean_length": 816.4166870117188, "completions/min_length": 685.0, "entropy/max": 0.2578125, "entropy/mean": 0.1796875, "entropy/min": 0.13525390625, "epoch": 0.6511156186612576, "frac_reward_zero_std": 0.0, "grad_norm": 0.11879252642393112, "learning_rate": 2.7645310350744293e-07, "loss": -0.010624796152114868, "reward": 1.9663195610046387, "reward_std": 0.24828308820724487, "rewards/DiagnosisAccuracyORM/mean": 0.6260417103767395, "rewards/DiagnosisAccuracyORM/std": 0.3786861002445221, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3402777910232544, "rewards/KeyDiagnosticEvidenceORM/std": 0.14704202115535736, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/mean_length": 820.9583740234375, "completions/min_length": 609.0, "entropy/max": 0.3095703125, "entropy/mean": 0.203125, "entropy/min": 0.13427734375, "epoch": 0.6531440162271805, "frac_reward_zero_std": 0.0, "grad_norm": 0.1395290344953537, "learning_rate": 2.7357854048980886e-07, "loss": -0.039947398006916046, "reward": 1.5885417461395264, "reward_std": 0.17816512286663055, "rewards/DiagnosisAccuracyORM/mean": 0.2708333432674408, "rewards/DiagnosisAccuracyORM/std": 0.21789070963859558, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3177083432674408, "rewards/KeyDiagnosticEvidenceORM/std": 0.11168678849935532, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/mean_length": 808.6458740234375, "completions/min_length": 675.0, "entropy/max": 0.3095703125, "entropy/mean": 0.19970703125, "entropy/min": 0.1416015625, "epoch": 0.6551724137931034, "frac_reward_zero_std": 0.0, "grad_norm": 0.125824436545372, "learning_rate": 2.707133612169485e-07, "loss": 0.013002999126911163, "reward": 1.9192460775375366, "reward_std": 0.23383042216300964, "rewards/DiagnosisAccuracyORM/mean": 0.5838293433189392, "rewards/DiagnosisAccuracyORM/std": 0.31903132796287537, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33541667461395264, "rewards/KeyDiagnosticEvidenceORM/std": 0.157850444316864, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/mean_length": 797.7083740234375, "completions/min_length": 657.0, "entropy/max": 0.2802734375, "entropy/mean": 0.1953125, "entropy/min": 0.12646484375, "epoch": 0.6572008113590264, "frac_reward_zero_std": 0.0, "grad_norm": 0.12907494604587555, "learning_rate": 2.6785768443251433e-07, "loss": -0.015093538910150528, "reward": 1.6584491729736328, "reward_std": 0.15749172866344452, "rewards/DiagnosisAccuracyORM/mean": 0.33888891339302063, "rewards/DiagnosisAccuracyORM/std": 0.32593443989753723, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3195601999759674, "rewards/KeyDiagnosticEvidenceORM/std": 0.09282783418893814, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/mean_length": 801.7708740234375, "completions/min_length": 588.0, "entropy/max": 0.2607421875, "entropy/mean": 0.1923828125, "entropy/min": 0.13671875, "epoch": 0.6592292089249493, "frac_reward_zero_std": 0.0, "grad_norm": 0.13889957964420319, "learning_rate": 2.6501162848634016e-07, "loss": -0.007493411656469107, "reward": 1.8382772207260132, "reward_std": 0.24297836422920227, "rewards/DiagnosisAccuracyORM/mean": 0.5113095641136169, "rewards/DiagnosisAccuracyORM/std": 0.3533617854118347, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32696759700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.12438971549272537, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/mean_length": 831.5625, "completions/min_length": 657.0, "entropy/max": 0.298828125, "entropy/mean": 0.19580078125, "entropy/min": 0.13671875, "epoch": 0.6612576064908722, "frac_reward_zero_std": 0.0, "grad_norm": 0.13292191922664642, "learning_rate": 2.6217531132953607e-07, "loss": 0.014453306794166565, "reward": 1.8210813999176025, "reward_std": 0.2437230795621872, "rewards/DiagnosisAccuracyORM/mean": 0.48080357909202576, "rewards/DiagnosisAccuracyORM/std": 0.2693110406398773, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3402777910232544, "rewards/KeyDiagnosticEvidenceORM/std": 0.08904758095741272, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/mean_length": 803.375, "completions/min_length": 618.0, "entropy/max": 0.3818359375, "entropy/mean": 0.20068359375, "entropy/min": 0.134765625, "epoch": 0.6632860040567952, "frac_reward_zero_std": 0.0, "grad_norm": 0.13762451708316803, "learning_rate": 2.593488505096018e-07, "loss": 0.00039137405110523105, "reward": 1.6723875999450684, "reward_std": 0.22311294078826904, "rewards/DiagnosisAccuracyORM/mean": 0.40560516715049744, "rewards/DiagnosisAccuracyORM/std": 0.2773529589176178, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2667824327945709, "rewards/KeyDiagnosticEvidenceORM/std": 0.096009761095047, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/mean_length": 814.3125, "completions/min_length": 660.0, "entropy/max": 0.294921875, "entropy/mean": 0.19091796875, "entropy/min": 0.1337890625, "epoch": 0.665314401622718, "frac_reward_zero_std": 0.0, "grad_norm": 0.13504751026630402, "learning_rate": 2.5653236316555315e-07, "loss": 0.0061257099732756615, "reward": 1.7197751998901367, "reward_std": 0.21671107411384583, "rewards/DiagnosisAccuracyORM/mean": 0.4466269910335541, "rewards/DiagnosisAccuracyORM/std": 0.30688920617103577, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27314814925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.13033132255077362, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/mean_length": 827.9375, "completions/min_length": 682.0, "entropy/max": 0.357421875, "entropy/mean": 0.20556640625, "entropy/min": 0.1455078125, "epoch": 0.6673427991886409, "frac_reward_zero_std": 0.0, "grad_norm": 0.13796722888946533, "learning_rate": 2.5372596602306784e-07, "loss": -0.0014111350756138563, "reward": 1.6484954357147217, "reward_std": 0.14637556672096252, "rewards/DiagnosisAccuracyORM/mean": 0.36145833134651184, "rewards/DiagnosisAccuracyORM/std": 0.27963072061538696, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28703704476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.1207863911986351, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/mean_length": 804.2083740234375, "completions/min_length": 639.0, "entropy/max": 0.2861328125, "entropy/mean": 0.19140625, "entropy/min": 0.12890625, "epoch": 0.6693711967545639, "frac_reward_zero_std": 0.0, "grad_norm": 0.11541479080915451, "learning_rate": 2.5092977538964883e-07, "loss": -0.007840260863304138, "reward": 1.6365079879760742, "reward_std": 0.22468839585781097, "rewards/DiagnosisAccuracyORM/mean": 0.38477182388305664, "rewards/DiagnosisAccuracyORM/std": 0.33312004804611206, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2517361342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.11608351767063141, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/mean_length": 825.0208740234375, "completions/min_length": 643.0, "entropy/max": 0.28515625, "entropy/mean": 0.203125, "entropy/min": 0.15771484375, "epoch": 0.6713995943204868, "frac_reward_zero_std": 0.0, "grad_norm": 0.1250486671924591, "learning_rate": 2.481439071498032e-07, "loss": -0.005115404725074768, "reward": 1.7228009700775146, "reward_std": 0.28666090965270996, "rewards/DiagnosisAccuracyORM/mean": 0.4635416567325592, "rewards/DiagnosisAccuracyORM/std": 0.3070806860923767, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25925925374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.11248169839382172, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/mean_length": 821.6875, "completions/min_length": 695.0, "entropy/max": 0.275390625, "entropy/mean": 0.18994140625, "entropy/min": 0.129150390625, "epoch": 0.6734279918864098, "frac_reward_zero_std": 0.0, "grad_norm": 0.1355646401643753, "learning_rate": 2.453684767602399e-07, "loss": -0.009737730026245117, "reward": 1.7787781953811646, "reward_std": 0.2541698217391968, "rewards/DiagnosisAccuracyORM/mean": 0.4240327775478363, "rewards/DiagnosisAccuracyORM/std": 0.2724495530128479, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.35474538803100586, "rewards/KeyDiagnosticEvidenceORM/std": 0.126353919506073, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/mean_length": 795.8541870117188, "completions/min_length": 637.0, "entropy/max": 0.404296875, "entropy/mean": 0.22265625, "entropy/min": 0.13525390625, "epoch": 0.6754563894523327, "frac_reward_zero_std": 0.0, "grad_norm": 0.1417602300643921, "learning_rate": 2.426035992450848e-07, "loss": 0.0030827857553958893, "reward": 1.7202050685882568, "reward_std": 0.21877598762512207, "rewards/DiagnosisAccuracyORM/mean": 0.4464782178401947, "rewards/DiagnosisAccuracyORM/std": 0.31425538659095764, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27372685074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.14416566491127014, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/mean_length": 800.0208740234375, "completions/min_length": 666.0, "entropy/max": 0.26171875, "entropy/mean": 0.1865234375, "entropy/min": 0.1416015625, "epoch": 0.6774847870182555, "frac_reward_zero_std": 0.0, "grad_norm": 0.13248726725578308, "learning_rate": 2.398493891911127e-07, "loss": 0.014865731820464134, "reward": 1.57511568069458, "reward_std": 0.1438671499490738, "rewards/DiagnosisAccuracyORM/mean": 0.2875000238418579, "rewards/DiagnosisAccuracyORM/std": 0.25800424814224243, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28761574625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.1350521594285965, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/mean_length": 826.2083740234375, "completions/min_length": 694.0, "entropy/max": 0.3330078125, "entropy/mean": 0.2099609375, "entropy/min": 0.12744140625, "epoch": 0.6795131845841785, "frac_reward_zero_std": 0.0, "grad_norm": 0.12428087741136551, "learning_rate": 2.3710596074300043e-07, "loss": 0.0020538445096462965, "reward": 1.8528770208358765, "reward_std": 0.22455526888370514, "rewards/DiagnosisAccuracyORM/mean": 0.5438492298126221, "rewards/DiagnosisAccuracyORM/std": 0.2408527433872223, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.309027761220932, "rewards/KeyDiagnosticEvidenceORM/std": 0.10851351916790009, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/mean_length": 823.875, "completions/min_length": 691.0, "entropy/max": 0.3720703125, "entropy/mean": 0.21435546875, "entropy/min": 0.14306640625, "epoch": 0.6815415821501014, "frac_reward_zero_std": 0.0, "grad_norm": 0.10166420787572861, "learning_rate": 2.3437342759859468e-07, "loss": 0.010420303791761398, "reward": 1.787731409072876, "reward_std": 0.2718750536441803, "rewards/DiagnosisAccuracyORM/mean": 0.47465279698371887, "rewards/DiagnosisAccuracyORM/std": 0.280133992433548, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31307873129844666, "rewards/KeyDiagnosticEvidenceORM/std": 0.10577301681041718, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/mean_length": 806.7291870117188, "completions/min_length": 580.0, "entropy/max": 0.296875, "entropy/mean": 0.197265625, "entropy/min": 0.12890625, "epoch": 0.6835699797160243, "frac_reward_zero_std": 0.0, "grad_norm": 0.13692396879196167, "learning_rate": 2.3165190300419978e-07, "loss": 0.00700274296104908, "reward": 1.686689853668213, "reward_std": 0.19201889634132385, "rewards/DiagnosisAccuracyORM/mean": 0.37361109256744385, "rewards/DiagnosisAccuracyORM/std": 0.22187767922878265, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31307873129844666, "rewards/KeyDiagnosticEvidenceORM/std": 0.14950443804264069, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/mean_length": 837.4375, "completions/min_length": 704.0, "entropy/max": 0.453125, "entropy/mean": 0.23193359375, "entropy/min": 0.15673828125, "epoch": 0.6855983772819473, "frac_reward_zero_std": 0.0, "grad_norm": 0.11292163282632828, "learning_rate": 2.2894149974988557e-07, "loss": 0.0005526195163838565, "reward": 1.8186920881271362, "reward_std": 0.18890495598316193, "rewards/DiagnosisAccuracyORM/mean": 0.49809029698371887, "rewards/DiagnosisAccuracyORM/std": 0.30851393938064575, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32060185074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.14664606750011444, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/mean_length": 796.25, "completions/min_length": 624.0, "entropy/max": 0.263671875, "entropy/mean": 0.19677734375, "entropy/min": 0.15966796875, "epoch": 0.6876267748478702, "frac_reward_zero_std": 0.0, "grad_norm": 0.12201409786939621, "learning_rate": 2.262423301648122e-07, "loss": -0.009645149111747742, "reward": 1.7438328266143799, "reward_std": 0.24426314234733582, "rewards/DiagnosisAccuracyORM/mean": 0.43769845366477966, "rewards/DiagnosisAccuracyORM/std": 0.2810027301311493, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30613425374031067, "rewards/KeyDiagnosticEvidenceORM/std": 0.1220245510339737, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/mean_length": 799.375, "completions/min_length": 670.0, "entropy/max": 0.29296875, "entropy/mean": 0.203125, "entropy/min": 0.13916015625, "epoch": 0.6896551724137931, "frac_reward_zero_std": 0.0, "grad_norm": 0.12638594210147858, "learning_rate": 2.2355450611257476e-07, "loss": 0.009642882272601128, "reward": 1.6765047311782837, "reward_std": 0.17833998799324036, "rewards/DiagnosisAccuracyORM/mean": 0.3645833432674408, "rewards/DiagnosisAccuracyORM/std": 0.2346373051404953, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3119213283061981, "rewards/KeyDiagnosticEvidenceORM/std": 0.12918075919151306, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/mean_length": 809.9791870117188, "completions/min_length": 653.0, "entropy/max": 0.2900390625, "entropy/mean": 0.193359375, "entropy/min": 0.1396484375, "epoch": 0.691683569979716, "frac_reward_zero_std": 0.0, "grad_norm": 0.10148575156927109, "learning_rate": 2.208781389865677e-07, "loss": 0.028875529766082764, "reward": 1.736689805984497, "reward_std": 0.2013435661792755, "rewards/DiagnosisAccuracyORM/mean": 0.4131944179534912, "rewards/DiagnosisAccuracyORM/std": 0.25606030225753784, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32349538803100586, "rewards/KeyDiagnosticEvidenceORM/std": 0.09283667802810669, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/mean_length": 838.9791870117188, "completions/min_length": 669.0, "entropy/max": 0.326171875, "entropy/mean": 0.22021484375, "entropy/min": 0.1708984375, "epoch": 0.6937119675456389, "frac_reward_zero_std": 0.0, "grad_norm": 0.13296642899513245, "learning_rate": 2.182133397053675e-07, "loss": 0.0064565567299723625, "reward": 1.797784447669983, "reward_std": 0.2639722228050232, "rewards/DiagnosisAccuracyORM/mean": 0.4435019791126251, "rewards/DiagnosisAccuracyORM/std": 0.3319928050041199, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.354282408952713, "rewards/KeyDiagnosticEvidenceORM/std": 0.16126078367233276, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/mean_length": 832.7708740234375, "completions/min_length": 738.0, "entropy/max": 0.30859375, "entropy/mean": 0.21142578125, "entropy/min": 0.13623046875, "epoch": 0.6957403651115619, "frac_reward_zero_std": 0.0, "grad_norm": 0.15108934044837952, "learning_rate": 2.1556021870813651e-07, "loss": -0.0024840906262397766, "reward": 1.86087965965271, "reward_std": 0.2346963733434677, "rewards/DiagnosisAccuracyORM/mean": 0.5732638835906982, "rewards/DiagnosisAccuracyORM/std": 0.3185320198535919, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28761574625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.14947697520256042, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/mean_length": 831.25, "completions/min_length": 689.0, "entropy/max": 0.4326171875, "entropy/mean": 0.22509765625, "entropy/min": 0.1591796875, "epoch": 0.6977687626774848, "frac_reward_zero_std": 0.0, "grad_norm": 0.11994815617799759, "learning_rate": 2.1291888595004587e-07, "loss": 0.003087341785430908, "reward": 1.773966670036316, "reward_std": 0.2547268867492676, "rewards/DiagnosisAccuracyORM/mean": 0.47477683424949646, "rewards/DiagnosisAccuracyORM/std": 0.3041379749774933, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29918983578681946, "rewards/KeyDiagnosticEvidenceORM/std": 0.14922964572906494, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/mean_length": 834.2916870117188, "completions/min_length": 655.0, "entropy/max": 0.4755859375, "entropy/mean": 0.23876953125, "entropy/min": 0.1474609375, "epoch": 0.6997971602434077, "frac_reward_zero_std": 0.0, "grad_norm": 0.13961556553840637, "learning_rate": 2.1028945089771816e-07, "loss": 0.017673831433057785, "reward": 1.891575813293457, "reward_std": 0.222183495759964, "rewards/DiagnosisAccuracyORM/mean": 0.5413442254066467, "rewards/DiagnosisAccuracyORM/std": 0.37860211730003357, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.35023149847984314, "rewards/KeyDiagnosticEvidenceORM/std": 0.2027812898159027, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/mean_length": 829.9166870117188, "completions/min_length": 683.0, "entropy/max": 0.3125, "entropy/mean": 0.220703125, "entropy/min": 0.15185546875, "epoch": 0.7018255578093306, "frac_reward_zero_std": 0.0, "grad_norm": 0.1152975857257843, "learning_rate": 2.0767202252469113e-07, "loss": 0.013922689482569695, "reward": 1.7460484504699707, "reward_std": 0.1854642778635025, "rewards/DiagnosisAccuracyORM/mean": 0.43759921193122864, "rewards/DiagnosisAccuracyORM/std": 0.3034621775150299, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30844905972480774, "rewards/KeyDiagnosticEvidenceORM/std": 0.11449337005615234, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/mean_length": 819.4375, "completions/min_length": 706.0, "entropy/max": 0.423828125, "entropy/mean": 0.23388671875, "entropy/min": 0.158203125, "epoch": 0.7038539553752535, "frac_reward_zero_std": 0.0, "grad_norm": 0.13570375740528107, "learning_rate": 2.0506670930690073e-07, "loss": 0.0029191970825195312, "reward": 1.6889550685882568, "reward_std": 0.1904650777578354, "rewards/DiagnosisAccuracyORM/mean": 0.4273809492588043, "rewards/DiagnosisAccuracyORM/std": 0.310433566570282, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.26157405972480774, "rewards/KeyDiagnosticEvidenceORM/std": 0.116707943379879, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/mean_length": 810.2083740234375, "completions/min_length": 653.0, "entropy/max": 0.4287109375, "entropy/mean": 0.2353515625, "entropy/min": 0.15283203125, "epoch": 0.7058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 0.58298659324646, "learning_rate": 2.0247361921818635e-07, "loss": -0.009811977855861187, "reward": 1.8462632894515991, "reward_std": 0.3103315234184265, "rewards/DiagnosisAccuracyORM/mean": 0.5470734238624573, "rewards/DiagnosisAccuracyORM/std": 0.3603196442127228, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29918980598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.13502176105976105, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/mean_length": 799.1666870117188, "completions/min_length": 678.0, "entropy/max": 0.2236328125, "entropy/mean": 0.18359375, "entropy/min": 0.14111328125, "epoch": 0.7079107505070994, "frac_reward_zero_std": 0.0, "grad_norm": 0.11480266600847244, "learning_rate": 1.9989285972581593e-07, "loss": 0.002860677894204855, "reward": 1.8660879135131836, "reward_std": 0.22066381573677063, "rewards/DiagnosisAccuracyORM/mean": 0.5541666746139526, "rewards/DiagnosisAccuracyORM/std": 0.3046817481517792, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31192129850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.14944952726364136, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/mean_length": 816.7291870117188, "completions/min_length": 689.0, "entropy/max": 0.2861328125, "entropy/mean": 0.18115234375, "entropy/min": 0.12890625, "epoch": 0.7099391480730223, "frac_reward_zero_std": 0.0, "grad_norm": 0.1377933770418167, "learning_rate": 1.9732453778603103e-07, "loss": -0.006428162567317486, "reward": 1.725429892539978, "reward_std": 0.25715965032577515, "rewards/DiagnosisAccuracyORM/mean": 0.4654761850833893, "rewards/DiagnosisAccuracyORM/std": 0.2740470767021179, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25995370745658875, "rewards/KeyDiagnosticEvidenceORM/std": 0.12322263419628143, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/mean_length": 800.2708740234375, "completions/min_length": 662.0, "entropy/max": 0.279296875, "entropy/mean": 0.20263671875, "entropy/min": 0.15283203125, "epoch": 0.7119675456389453, "frac_reward_zero_std": 0.0, "grad_norm": 0.13479363918304443, "learning_rate": 1.947687598396154e-07, "loss": 0.00027556222630664706, "reward": 1.8811508417129517, "reward_std": 0.25073370337486267, "rewards/DiagnosisAccuracyORM/mean": 0.5599702000617981, "rewards/DiagnosisAccuracyORM/std": 0.32551127672195435, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3211805522441864, "rewards/KeyDiagnosticEvidenceORM/std": 0.1270880550146103, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/mean_length": 786.1458740234375, "completions/min_length": 639.0, "entropy/max": 0.3408203125, "entropy/mean": 0.19091796875, "entropy/min": 0.1357421875, "epoch": 0.7139959432048681, "frac_reward_zero_std": 0.0, "grad_norm": 0.13114720582962036, "learning_rate": 1.9222563180748297e-07, "loss": -0.011180217377841473, "reward": 1.6658565998077393, "reward_std": 0.2455524504184723, "rewards/DiagnosisAccuracyORM/mean": 0.37013888359069824, "rewards/DiagnosisAccuracyORM/std": 0.26946574449539185, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29571759700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.14296495914459229, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/mean_length": 826.25, "completions/min_length": 705.0, "entropy/max": 0.2392578125, "entropy/mean": 0.181640625, "entropy/min": 0.1259765625, "epoch": 0.716024340770791, "frac_reward_zero_std": 0.0, "grad_norm": 0.10545913875102997, "learning_rate": 1.896952590862886e-07, "loss": -0.0013998424401506782, "reward": 1.8280092477798462, "reward_std": 0.154842346906662, "rewards/DiagnosisAccuracyORM/mean": 0.4246527850627899, "rewards/DiagnosisAccuracyORM/std": 0.30334988236427307, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.4033564627170563, "rewards/KeyDiagnosticEvidenceORM/std": 0.11517246067523956, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/mean_length": 825.5625, "completions/min_length": 705.0, "entropy/max": 0.328125, "entropy/mean": 0.2216796875, "entropy/min": 0.1552734375, "epoch": 0.718052738336714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1199328750371933, "learning_rate": 1.871777465440596e-07, "loss": -0.002025613095611334, "reward": 1.785069465637207, "reward_std": 0.21714375913143158, "rewards/DiagnosisAccuracyORM/mean": 0.5090277791023254, "rewards/DiagnosisAccuracyORM/std": 0.29108014702796936, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2760416567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.11494654417037964, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/mean_length": 826.5833740234375, "completions/min_length": 652.0, "entropy/max": 0.447265625, "entropy/mean": 0.2236328125, "entropy/min": 0.1416015625, "epoch": 0.7200811359026369, "frac_reward_zero_std": 0.0, "grad_norm": 0.1515570729970932, "learning_rate": 1.846731985158495e-07, "loss": -0.001982375979423523, "reward": 1.7497353553771973, "reward_std": 0.21185937523841858, "rewards/DiagnosisAccuracyORM/mean": 0.42797622084617615, "rewards/DiagnosisAccuracyORM/std": 0.25371676683425903, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32175928354263306, "rewards/KeyDiagnosticEvidenceORM/std": 0.1533096581697464, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/mean_length": 799.0625, "completions/min_length": 661.0, "entropy/max": 0.2353515625, "entropy/mean": 0.1796875, "entropy/min": 0.130126953125, "epoch": 0.7221095334685599, "frac_reward_zero_std": 0.0, "grad_norm": 0.12082468718290329, "learning_rate": 1.8218171879941463e-07, "loss": -0.00969721656292677, "reward": 1.7304399013519287, "reward_std": 0.20811080932617188, "rewards/DiagnosisAccuracyORM/mean": 0.39189815521240234, "rewards/DiagnosisAccuracyORM/std": 0.2347991168498993, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3385416567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.12705576419830322, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/mean_length": 815.0833740234375, "completions/min_length": 684.0, "entropy/max": 0.2666015625, "entropy/mean": 0.1875, "entropy/min": 0.134765625, "epoch": 0.7241379310344828, "frac_reward_zero_std": 0.0, "grad_norm": 0.13887304067611694, "learning_rate": 1.7970341065091243e-07, "loss": -0.0010036826133728027, "reward": 1.6201472282409668, "reward_std": 0.19160068035125732, "rewards/DiagnosisAccuracyORM/mean": 0.3688739836215973, "rewards/DiagnosisAccuracyORM/std": 0.21322856843471527, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25127315521240234, "rewards/KeyDiagnosticEvidenceORM/std": 0.09796484559774399, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/mean_length": 822.6041870117188, "completions/min_length": 691.0, "entropy/max": 0.3134765625, "entropy/mean": 0.2158203125, "entropy/min": 0.166015625, "epoch": 0.7261663286004056, "frac_reward_zero_std": 0.0, "grad_norm": 0.1416293978691101, "learning_rate": 1.772383767806208e-07, "loss": -0.007833379320800304, "reward": 1.867708444595337, "reward_std": 0.2526547312736511, "rewards/DiagnosisAccuracyORM/mean": 0.5760416388511658, "rewards/DiagnosisAccuracyORM/std": 0.2900824546813965, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2916666865348816, "rewards/KeyDiagnosticEvidenceORM/std": 0.12915030121803284, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/mean_length": 805.9583740234375, "completions/min_length": 672.0, "entropy/max": 0.302734375, "entropy/mean": 0.21044921875, "entropy/min": 0.1572265625, "epoch": 0.7281947261663286, "frac_reward_zero_std": 0.0, "grad_norm": 0.13665026426315308, "learning_rate": 1.74786719348683e-07, "loss": -0.005128468386828899, "reward": 1.7435681819915771, "reward_std": 0.24074946343898773, "rewards/DiagnosisAccuracyORM/mean": 0.4773643910884857, "rewards/DiagnosisAccuracyORM/std": 0.32146212458610535, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.26620373129844666, "rewards/KeyDiagnosticEvidenceORM/std": 0.1622578203678131, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/mean_length": 811.625, "completions/min_length": 595.0, "entropy/max": 0.3369140625, "entropy/mean": 0.20556640625, "entropy/min": 0.1552734375, "epoch": 0.7302231237322515, "frac_reward_zero_std": 0.0, "grad_norm": 0.09829256683588028, "learning_rate": 1.72348539960873e-07, "loss": -0.007042774464935064, "reward": 1.7274553775787354, "reward_std": 0.1721876710653305, "rewards/DiagnosisAccuracyORM/mean": 0.39846229553222656, "rewards/DiagnosisAccuracyORM/std": 0.3644244074821472, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3289930522441864, "rewards/KeyDiagnosticEvidenceORM/std": 0.14945495128631592, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/mean_length": 804.2708740234375, "completions/min_length": 677.0, "entropy/max": 0.33203125, "entropy/mean": 0.2109375, "entropy/min": 0.14013671875, "epoch": 0.7322515212981744, "frac_reward_zero_std": 0.0, "grad_norm": 0.12358502298593521, "learning_rate": 1.6992393966438405e-07, "loss": 0.005287293344736099, "reward": 1.6403934955596924, "reward_std": 0.13253462314605713, "rewards/DiagnosisAccuracyORM/mean": 0.3840278089046478, "rewards/DiagnosisAccuracyORM/std": 0.2717589735984802, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25636574625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.10428808629512787, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/mean_length": 807.5833740234375, "completions/min_length": 700.0, "entropy/max": 0.2978515625, "entropy/mean": 0.1884765625, "entropy/min": 0.13720703125, "epoch": 0.7342799188640974, "frac_reward_zero_std": 0.0, "grad_norm": 0.13917021453380585, "learning_rate": 1.6751301894364273e-07, "loss": 0.029003646224737167, "reward": 1.8746528625488281, "reward_std": 0.20933866500854492, "rewards/DiagnosisAccuracyORM/mean": 0.49444445967674255, "rewards/DiagnosisAccuracyORM/std": 0.24524638056755066, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3802083432674408, "rewards/KeyDiagnosticEvidenceORM/std": 0.12529924511909485, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/mean_length": 836.9166870117188, "completions/min_length": 706.0, "entropy/max": 0.4443359375, "entropy/mean": 0.24267578125, "entropy/min": 0.1572265625, "epoch": 0.7363083164300203, "frac_reward_zero_std": 0.0, "grad_norm": 0.15305203199386597, "learning_rate": 1.6511587771614204e-07, "loss": 0.0051194727420806885, "reward": 1.8717594146728516, "reward_std": 0.24651890993118286, "rewards/DiagnosisAccuracyORM/mean": 0.5951389074325562, "rewards/DiagnosisAccuracyORM/std": 0.33118024468421936, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27662035822868347, "rewards/KeyDiagnosticEvidenceORM/std": 0.14886823296546936, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/mean_length": 822.4791870117188, "completions/min_length": 687.0, "entropy/max": 0.3779296875, "entropy/mean": 0.2109375, "entropy/min": 0.134765625, "epoch": 0.7383367139959433, "frac_reward_zero_std": 0.0, "grad_norm": 0.15543894469738007, "learning_rate": 1.627326153283024e-07, "loss": 0.004101622849702835, "reward": 1.8130788803100586, "reward_std": 0.23521190881729126, "rewards/DiagnosisAccuracyORM/mean": 0.5173611044883728, "rewards/DiagnosisAccuracyORM/std": 0.3433074653148651, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29571759700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.1726173460483551, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/mean_length": 818.9791870117188, "completions/min_length": 651.0, "entropy/max": 0.283203125, "entropy/mean": 0.20458984375, "entropy/min": 0.140625, "epoch": 0.7403651115618661, "frac_reward_zero_std": 0.0, "grad_norm": 0.12457302957773209, "learning_rate": 1.6036333055135344e-07, "loss": 0.014871887862682343, "reward": 1.7354166507720947, "reward_std": 0.2035483419895172, "rewards/DiagnosisAccuracyORM/mean": 0.4263888895511627, "rewards/DiagnosisAccuracyORM/std": 0.24492083489894867, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3090277910232544, "rewards/KeyDiagnosticEvidenceORM/std": 0.14136388897895813, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/mean_length": 817.625, "completions/min_length": 624.0, "entropy/max": 0.3681640625, "entropy/mean": 0.2119140625, "entropy/min": 0.135986328125, "epoch": 0.742393509127789, "frac_reward_zero_std": 0.0, "grad_norm": 0.10979513078927994, "learning_rate": 1.5800812157724081e-07, "loss": -0.016655167564749718, "reward": 1.8152117729187012, "reward_std": 0.2026771605014801, "rewards/DiagnosisAccuracyORM/mean": 0.552480161190033, "rewards/DiagnosisAccuracyORM/std": 0.2894662320613861, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.26273149251937866, "rewards/KeyDiagnosticEvidenceORM/std": 0.11728675663471222, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/mean_length": 804.9375, "completions/min_length": 669.0, "entropy/max": 0.26220703125, "entropy/mean": 0.18896484375, "entropy/min": 0.120849609375, "epoch": 0.744421906693712, "frac_reward_zero_std": 0.0, "grad_norm": 0.12142083793878555, "learning_rate": 1.556670860145567e-07, "loss": -0.0005619823932647705, "reward": 1.7736855745315552, "reward_std": 0.20728275179862976, "rewards/DiagnosisAccuracyORM/mean": 0.4733382761478424, "rewards/DiagnosisAccuracyORM/std": 0.29538193345069885, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3003472089767456, "rewards/KeyDiagnosticEvidenceORM/std": 0.13261927664279938, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/mean_length": 802.75, "completions/min_length": 705.0, "entropy/max": 0.34375, "entropy/mean": 0.203125, "entropy/min": 0.12890625, "epoch": 0.7464503042596349, "frac_reward_zero_std": 0.0, "grad_norm": 0.14326684176921844, "learning_rate": 1.5334032088449466e-07, "loss": -0.0022268842440098524, "reward": 1.7177084684371948, "reward_std": 0.23208323121070862, "rewards/DiagnosisAccuracyORM/mean": 0.37222227454185486, "rewards/DiagnosisAccuracyORM/std": 0.2880260646343231, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3454861342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.10825318098068237, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/mean_length": 807.875, "completions/min_length": 649.0, "entropy/max": 0.3310546875, "entropy/mean": 0.197265625, "entropy/min": 0.1455078125, "epoch": 0.7484787018255578, "frac_reward_zero_std": 0.0, "grad_norm": 0.11391477286815643, "learning_rate": 1.510279226168281e-07, "loss": -0.0015782354166731238, "reward": 1.751273274421692, "reward_std": 0.2685990333557129, "rewards/DiagnosisAccuracyORM/mean": 0.4642361104488373, "rewards/DiagnosisAccuracyORM/std": 0.3167690932750702, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28703704476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.09421355277299881, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/mean_length": 799.5833740234375, "completions/min_length": 675.0, "entropy/max": 0.3203125, "entropy/mean": 0.1865234375, "entropy/min": 0.13427734375, "epoch": 0.7505070993914807, "frac_reward_zero_std": 0.0, "grad_norm": 0.11792805045843124, "learning_rate": 1.487299870459155e-07, "loss": -0.007970042526721954, "reward": 1.7957175970077515, "reward_std": 0.2640365958213806, "rewards/DiagnosisAccuracyORM/mean": 0.4670139253139496, "rewards/DiagnosisAccuracyORM/std": 0.22522616386413574, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32870370149612427, "rewards/KeyDiagnosticEvidenceORM/std": 0.10012686997652054, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/mean_length": 841.0625, "completions/min_length": 729.0, "entropy/max": 0.3564453125, "entropy/mean": 0.21630859375, "entropy/min": 0.12939453125, "epoch": 0.7525354969574036, "frac_reward_zero_std": 0.0, "grad_norm": 0.12556853890419006, "learning_rate": 1.4644660940672627e-07, "loss": -0.002922689076513052, "reward": 1.8643519878387451, "reward_std": 0.23462416231632233, "rewards/DiagnosisAccuracyORM/mean": 0.4864583909511566, "rewards/DiagnosisAccuracyORM/std": 0.28277283906936646, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3778935372829437, "rewards/KeyDiagnosticEvidenceORM/std": 0.1168353483080864, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/mean_length": 817.7083740234375, "completions/min_length": 688.0, "entropy/max": 0.3193359375, "entropy/mean": 0.2099609375, "entropy/min": 0.130859375, "epoch": 0.7545638945233266, "frac_reward_zero_std": 0.0, "grad_norm": 0.1284826546907425, "learning_rate": 1.4417788433089595e-07, "loss": -0.005723678506910801, "reward": 1.8434854745864868, "reward_std": 0.16947662830352783, "rewards/DiagnosisAccuracyORM/mean": 0.51906418800354, "rewards/DiagnosisAccuracyORM/std": 0.2011955976486206, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3244212865829468, "rewards/KeyDiagnosticEvidenceORM/std": 0.13893729448318481, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/mean_length": 832.5833740234375, "completions/min_length": 655.0, "entropy/max": 0.3017578125, "entropy/mean": 0.21435546875, "entropy/min": 0.15380859375, "epoch": 0.7565922920892495, "frac_reward_zero_std": 0.0, "grad_norm": 0.1424219310283661, "learning_rate": 1.4192390584280344e-07, "loss": 0.012721601873636246, "reward": 1.864914059638977, "reward_std": 0.23935166001319885, "rewards/DiagnosisAccuracyORM/mean": 0.5240575671195984, "rewards/DiagnosisAccuracyORM/std": 0.29769957065582275, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34085652232170105, "rewards/KeyDiagnosticEvidenceORM/std": 0.11427809298038483, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/mean_length": 808.25, "completions/min_length": 674.0, "entropy/max": 0.39453125, "entropy/mean": 0.22998046875, "entropy/min": 0.1416015625, "epoch": 0.7586206896551724, "frac_reward_zero_std": 0.0, "grad_norm": 0.1574830710887909, "learning_rate": 1.3968476735567392e-07, "loss": 0.003284213598817587, "reward": 1.7620866298675537, "reward_std": 0.24578310549259186, "rewards/DiagnosisAccuracyORM/mean": 0.4148644208908081, "rewards/DiagnosisAccuracyORM/std": 0.2554991543292999, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3472222089767456, "rewards/KeyDiagnosticEvidenceORM/std": 0.15354150533676147, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/mean_length": 814.9583740234375, "completions/min_length": 626.0, "entropy/max": 0.3291015625, "entropy/mean": 0.201171875, "entropy/min": 0.14404296875, "epoch": 0.7606490872210954, "frac_reward_zero_std": 0.0, "grad_norm": 0.14641748368740082, "learning_rate": 1.374605616677087e-07, "loss": 0.013947638683021069, "reward": 1.6816883087158203, "reward_std": 0.23675259947776794, "rewards/DiagnosisAccuracyORM/mean": 0.38712796568870544, "rewards/DiagnosisAccuracyORM/std": 0.29177674651145935, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29456019401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.12758229672908783, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/mean_length": 812.3125, "completions/min_length": 582.0, "entropy/max": 0.3193359375, "entropy/mean": 0.201171875, "entropy/min": 0.1396484375, "epoch": 0.7626774847870182, "frac_reward_zero_std": 0.0, "grad_norm": 0.11813350766897202, "learning_rate": 1.3525138095823768e-07, "loss": -0.0012726932764053345, "reward": 1.7167162895202637, "reward_std": 0.15166476368904114, "rewards/DiagnosisAccuracyORM/mean": 0.4198412597179413, "rewards/DiagnosisAccuracyORM/std": 0.3040122985839844, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.296875, "rewards/KeyDiagnosticEvidenceORM/std": 0.13081198930740356, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/mean_length": 813.0, "completions/min_length": 597.0, "entropy/max": 0.28125, "entropy/mean": 0.197265625, "entropy/min": 0.13134765625, "epoch": 0.7647058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 0.12153849750757217, "learning_rate": 1.3305731678390046e-07, "loss": -0.007375722285360098, "reward": 1.7565972805023193, "reward_std": 0.2291254699230194, "rewards/DiagnosisAccuracyORM/mean": 0.41111111640930176, "rewards/DiagnosisAccuracyORM/std": 0.3314010202884674, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3454861342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.1440422385931015, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/mean_length": 831.3333740234375, "completions/min_length": 745.0, "entropy/max": 0.3544921875, "entropy/mean": 0.22509765625, "entropy/min": 0.15185546875, "epoch": 0.7667342799188641, "frac_reward_zero_std": 0.0, "grad_norm": 0.1310758739709854, "learning_rate": 1.308784600748513e-07, "loss": 0.0072730425745248795, "reward": 1.7586805820465088, "reward_std": 0.2265482395887375, "rewards/DiagnosisAccuracyORM/mean": 0.4427083432674408, "rewards/DiagnosisAccuracyORM/std": 0.26970577239990234, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3159722089767456, "rewards/KeyDiagnosticEvidenceORM/std": 0.12163854390382767, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/mean_length": 807.1666870117188, "completions/min_length": 665.0, "entropy/max": 0.24951171875, "entropy/mean": 0.19091796875, "entropy/min": 0.13525390625, "epoch": 0.768762677484787, "frac_reward_zero_std": 0.0, "grad_norm": 0.11777792125940323, "learning_rate": 1.2871490113099064e-07, "loss": 0.002495221793651581, "reward": 1.887549638748169, "reward_std": 0.1510644257068634, "rewards/DiagnosisAccuracyORM/mean": 0.5715773701667786, "rewards/DiagnosisAccuracyORM/std": 0.30561137199401855, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3159722089767456, "rewards/KeyDiagnosticEvidenceORM/std": 0.10911700129508972, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/mean_length": 797.9791870117188, "completions/min_length": 635.0, "entropy/max": 0.34326171875, "entropy/mean": 0.1953125, "entropy/min": 0.13623046875, "epoch": 0.77079107505071, "frac_reward_zero_std": 0.0, "grad_norm": 0.12364068627357483, "learning_rate": 1.2656672961822285e-07, "loss": -0.015642326325178146, "reward": 1.6853257417678833, "reward_std": 0.25769275426864624, "rewards/DiagnosisAccuracyORM/mean": 0.40349701046943665, "rewards/DiagnosisAccuracyORM/std": 0.29429343342781067, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28182870149612427, "rewards/KeyDiagnosticEvidenceORM/std": 0.13253672420978546, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/mean_length": 819.7083740234375, "completions/min_length": 696.0, "entropy/max": 0.369140625, "entropy/mean": 0.20947265625, "entropy/min": 0.15087890625, "epoch": 0.7728194726166329, "frac_reward_zero_std": 0.0, "grad_norm": 0.14459165930747986, "learning_rate": 1.2443403456474016e-07, "loss": -0.0007871116395108402, "reward": 1.9189815521240234, "reward_std": 0.25821277499198914, "rewards/DiagnosisAccuracyORM/mean": 0.5277777910232544, "rewards/DiagnosisAccuracyORM/std": 0.3445512056350708, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.39120373129844666, "rewards/KeyDiagnosticEvidenceORM/std": 0.11838392913341522, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/mean_length": 819.2291870117188, "completions/min_length": 664.0, "entropy/max": 0.3251953125, "entropy/mean": 0.19482421875, "entropy/min": 0.134765625, "epoch": 0.7748478701825557, "frac_reward_zero_std": 0.0, "grad_norm": 0.12341926246881485, "learning_rate": 1.223169043573325e-07, "loss": 0.009894895367324352, "reward": 1.7859623432159424, "reward_std": 0.19204679131507874, "rewards/DiagnosisAccuracyORM/mean": 0.4717261791229248, "rewards/DiagnosisAccuracyORM/std": 0.21211875975131989, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3142361342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.10009783506393433, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/mean_length": 824.125, "completions/min_length": 693.0, "entropy/max": 0.3388671875, "entropy/mean": 0.20947265625, "entropy/min": 0.13720703125, "epoch": 0.7768762677484787, "frac_reward_zero_std": 0.0, "grad_norm": 0.14434483647346497, "learning_rate": 1.2021542673772584e-07, "loss": 0.014722881838679314, "reward": 1.67875337600708, "reward_std": 0.20330147445201874, "rewards/DiagnosisAccuracyORM/mean": 0.35352182388305664, "rewards/DiagnosisAccuracyORM/std": 0.2795829772949219, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3252314627170563, "rewards/KeyDiagnosticEvidenceORM/std": 0.11169443279504776, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/mean_length": 820.9791870117188, "completions/min_length": 701.0, "entropy/max": 0.3271484375, "entropy/mean": 0.21875, "entropy/min": 0.1494140625, "epoch": 0.7789046653144016, "frac_reward_zero_std": 0.0, "grad_norm": 0.13034185767173767, "learning_rate": 1.1812968879894386e-07, "loss": 0.013247326016426086, "reward": 1.6723545789718628, "reward_std": 0.18543675541877747, "rewards/DiagnosisAccuracyORM/mean": 0.38878968358039856, "rewards/DiagnosisAccuracyORM/std": 0.2441193163394928, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28356480598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.10931742936372757, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/mean_length": 841.6458740234375, "completions/min_length": 668.0, "entropy/max": 0.322265625, "entropy/mean": 0.220703125, "entropy/min": 0.12744140625, "epoch": 0.7809330628803245, "frac_reward_zero_std": 0.0, "grad_norm": 0.13564647734165192, "learning_rate": 1.1605977698169999e-07, "loss": -0.0061184195801615715, "reward": 1.7784970998764038, "reward_std": 0.17125475406646729, "rewards/DiagnosisAccuracyORM/mean": 0.49203869700431824, "rewards/DiagnosisAccuracyORM/std": 0.25468888878822327, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2864583432674408, "rewards/KeyDiagnosticEvidenceORM/std": 0.10091456025838852, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/mean_length": 800.4166870117188, "completions/min_length": 669.0, "entropy/max": 0.26416015625, "entropy/mean": 0.1923828125, "entropy/min": 0.1533203125, "epoch": 0.7829614604462475, "frac_reward_zero_std": 0.0, "grad_norm": 0.12805390357971191, "learning_rate": 1.1400577707081466e-07, "loss": 0.00485491007566452, "reward": 1.8784723281860352, "reward_std": 0.18887366354465485, "rewards/DiagnosisAccuracyORM/mean": 0.5572916865348816, "rewards/DiagnosisAccuracyORM/std": 0.2557357847690582, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3211805820465088, "rewards/KeyDiagnosticEvidenceORM/std": 0.14151260256767273, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/mean_length": 800.3333740234375, "completions/min_length": 647.0, "entropy/max": 0.302734375, "entropy/mean": 0.20751953125, "entropy/min": 0.14453125, "epoch": 0.7849898580121704, "frac_reward_zero_std": 0.0, "grad_norm": 0.14926695823669434, "learning_rate": 1.1196777419165926e-07, "loss": 0.0039909956976771355, "reward": 1.9346065521240234, "reward_std": 0.2612247169017792, "rewards/DiagnosisAccuracyORM/mean": 0.5572916865348816, "rewards/DiagnosisAccuracyORM/std": 0.2849434018135071, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.37731480598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.10126812011003494, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/mean_length": 826.3333740234375, "completions/min_length": 668.0, "entropy/max": 0.400390625, "entropy/mean": 0.22216796875, "entropy/min": 0.14599609375, "epoch": 0.7870182555780934, "frac_reward_zero_std": 0.0, "grad_norm": 0.11691329628229141, "learning_rate": 1.0994585280662977e-07, "loss": -0.013467024080455303, "reward": 1.772850513458252, "reward_std": 0.20000898838043213, "rewards/DiagnosisAccuracyORM/mean": 0.4319940507411957, "rewards/DiagnosisAccuracyORM/std": 0.34529203176498413, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3408564627170563, "rewards/KeyDiagnosticEvidenceORM/std": 0.13262958824634552, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/mean_length": 812.2708740234375, "completions/min_length": 673.0, "entropy/max": 0.25341796875, "entropy/mean": 0.197265625, "entropy/min": 0.125732421875, "epoch": 0.7890466531440162, "frac_reward_zero_std": 0.0, "grad_norm": 0.12960591912269592, "learning_rate": 1.0794009671164483e-07, "loss": -0.0026159435510635376, "reward": 1.8206350803375244, "reward_std": 0.1952739655971527, "rewards/DiagnosisAccuracyORM/mean": 0.4525793790817261, "rewards/DiagnosisAccuracyORM/std": 0.2707940638065338, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3680555820465088, "rewards/KeyDiagnosticEvidenceORM/std": 0.11792075634002686, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/mean_length": 817.5208740234375, "completions/min_length": 653.0, "entropy/max": 0.23974609375, "entropy/mean": 0.18798828125, "entropy/min": 0.14208984375, "epoch": 0.7910750507099391, "frac_reward_zero_std": 0.0, "grad_norm": 0.13542236387729645, "learning_rate": 1.0595058903267357e-07, "loss": -0.006311188451945782, "reward": 1.782804250717163, "reward_std": 0.20643392205238342, "rewards/DiagnosisAccuracyORM/mean": 0.45410051941871643, "rewards/DiagnosisAccuracyORM/std": 0.27048686146736145, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32870370149612427, "rewards/KeyDiagnosticEvidenceORM/std": 0.13220728933811188, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/mean_length": 828.0208740234375, "completions/min_length": 697.0, "entropy/max": 0.4580078125, "entropy/mean": 0.2236328125, "entropy/min": 0.13427734375, "epoch": 0.7931034482758621, "frac_reward_zero_std": 0.0, "grad_norm": 0.13002565503120422, "learning_rate": 1.0397741222229057e-07, "loss": 0.00024854642106220126, "reward": 1.8275463581085205, "reward_std": 0.24747538566589355, "rewards/DiagnosisAccuracyORM/mean": 0.5017361044883728, "rewards/DiagnosisAccuracyORM/std": 0.3440955877304077, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32581019401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.12822405993938446, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/mean_length": 820.3541870117188, "completions/min_length": 670.0, "entropy/max": 0.44921875, "entropy/mean": 0.2265625, "entropy/min": 0.13818359375, "epoch": 0.795131845841785, "frac_reward_zero_std": 0.0, "grad_norm": 0.15584230422973633, "learning_rate": 1.0202064805625882e-07, "loss": 0.028118832036852837, "reward": 1.6907572746276855, "reward_std": 0.31986087560653687, "rewards/DiagnosisAccuracyORM/mean": 0.46159061789512634, "rewards/DiagnosisAccuracyORM/std": 0.27248749136924744, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2291666716337204, "rewards/KeyDiagnosticEvidenceORM/std": 0.14018310606479645, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/mean_length": 809.0833740234375, "completions/min_length": 711.0, "entropy/max": 0.2431640625, "entropy/mean": 0.193359375, "entropy/min": 0.150390625, "epoch": 0.7971602434077079, "frac_reward_zero_std": 0.0, "grad_norm": 0.11330313235521317, "learning_rate": 1.0008037763014033e-07, "loss": 0.013908488675951958, "reward": 1.7594246864318848, "reward_std": 0.24194498360157013, "rewards/DiagnosisAccuracyORM/mean": 0.4590774476528168, "rewards/DiagnosisAccuracyORM/std": 0.29648175835609436, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.300347238779068, "rewards/KeyDiagnosticEvidenceORM/std": 0.14367136359214783, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/mean_length": 807.0833740234375, "completions/min_length": 693.0, "entropy/max": 0.25048828125, "entropy/mean": 0.19140625, "entropy/min": 0.138671875, "epoch": 0.7991886409736308, "frac_reward_zero_std": 0.0, "grad_norm": 0.11552086472511292, "learning_rate": 9.815668135593546e-08, "loss": -0.007729006465524435, "reward": 1.8622355461120605, "reward_std": 0.2090074121952057, "rewards/DiagnosisAccuracyORM/mean": 0.4432539641857147, "rewards/DiagnosisAccuracyORM/std": 0.31042513251304626, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.4189814627170563, "rewards/KeyDiagnosticEvidenceORM/std": 0.11371553689241409, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/mean_length": 784.1875, "completions/min_length": 644.0, "entropy/max": 0.22705078125, "entropy/mean": 0.17822265625, "entropy/min": 0.13037109375, "epoch": 0.8012170385395537, "frac_reward_zero_std": 0.0, "grad_norm": 0.11814888566732407, "learning_rate": 9.624963895874994e-08, "loss": 0.009378130547702312, "reward": 1.8436343669891357, "reward_std": 0.18116408586502075, "rewards/DiagnosisAccuracyORM/mean": 0.48923611640930176, "rewards/DiagnosisAccuracyORM/std": 0.3080574870109558, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.35439813137054443, "rewards/KeyDiagnosticEvidenceORM/std": 0.1551622748374939, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/mean_length": 797.4791870117188, "completions/min_length": 639.0, "entropy/max": 0.3837890625, "entropy/mean": 0.20458984375, "entropy/min": 0.14599609375, "epoch": 0.8032454361054767, "frac_reward_zero_std": 0.0, "grad_norm": 0.15092986822128296, "learning_rate": 9.435932947349168e-08, "loss": -0.004388037137687206, "reward": 1.737632393836975, "reward_std": 0.22978302836418152, "rewards/DiagnosisAccuracyORM/mean": 0.49920639395713806, "rewards/DiagnosisAccuracyORM/std": 0.2766980230808258, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.23842592537403107, "rewards/KeyDiagnosticEvidenceORM/std": 0.11921308934688568, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/mean_length": 824.7916870117188, "completions/min_length": 652.0, "entropy/max": 0.26171875, "entropy/mean": 0.20263671875, "entropy/min": 0.15478515625, "epoch": 0.8052738336713996, "frac_reward_zero_std": 0.0, "grad_norm": 0.14462102949619293, "learning_rate": 9.248583124159437e-08, "loss": 0.004724852740764618, "reward": 1.8882441520690918, "reward_std": 0.23638322949409485, "rewards/DiagnosisAccuracyORM/mean": 0.5896329283714294, "rewards/DiagnosisAccuracyORM/std": 0.3152082860469818, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2986111044883728, "rewards/KeyDiagnosticEvidenceORM/std": 0.0894155502319336, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/mean_length": 810.5, "completions/min_length": 670.0, "entropy/max": 0.458984375, "entropy/mean": 0.20654296875, "entropy/min": 0.13525390625, "epoch": 0.8073022312373225, "frac_reward_zero_std": 0.0, "grad_norm": 0.12704789638519287, "learning_rate": 9.062922190777079e-08, "loss": -0.007816231809556484, "reward": 1.4717262983322144, "reward_std": 0.15145935118198395, "rewards/DiagnosisAccuracyORM/mean": 0.2720734179019928, "rewards/DiagnosisAccuracyORM/std": 0.24570943415164948, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.1996527761220932, "rewards/KeyDiagnosticEvidenceORM/std": 0.09769074618816376, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/mean_length": 798.6875, "completions/min_length": 656.0, "entropy/max": 0.3037109375, "entropy/mean": 0.1962890625, "entropy/min": 0.13427734375, "epoch": 0.8093306288032455, "frac_reward_zero_std": 0.0, "grad_norm": 0.13179869949817657, "learning_rate": 8.878957841679541e-08, "loss": -0.0036621689796447754, "reward": 1.9096065759658813, "reward_std": 0.2030489444732666, "rewards/DiagnosisAccuracyORM/mean": 0.5392360687255859, "rewards/DiagnosisAccuracyORM/std": 0.23491081595420837, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.37037038803100586, "rewards/KeyDiagnosticEvidenceORM/std": 0.11705908179283142, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/mean_length": 802.5208740234375, "completions/min_length": 642.0, "entropy/max": 0.44921875, "entropy/mean": 0.20458984375, "entropy/min": 0.134765625, "epoch": 0.8113590263691683, "frac_reward_zero_std": 0.0, "grad_norm": 0.11676429212093353, "learning_rate": 8.696697701031542e-08, "loss": -0.01596897467970848, "reward": 1.7602348327636719, "reward_std": 0.14175938069820404, "rewards/DiagnosisAccuracyORM/mean": 0.48824405670166016, "rewards/DiagnosisAccuracyORM/std": 0.32832664251327515, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27199074625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.1181236207485199, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1059.0, "completions/mean_length": 813.125, "completions/min_length": 644.0, "entropy/max": 0.26953125, "entropy/mean": 0.19384765625, "entropy/min": 0.138671875, "epoch": 0.8133874239350912, "frac_reward_zero_std": 0.0, "grad_norm": 0.10672919452190399, "learning_rate": 8.516149322369054e-08, "loss": -0.003821256337687373, "reward": 1.8784723281860352, "reward_std": 0.309354305267334, "rewards/DiagnosisAccuracyORM/mean": 0.5017361044883728, "rewards/DiagnosisAccuracyORM/std": 0.33983445167541504, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3767361342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.1272171586751938, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/mean_length": 791.875, "completions/min_length": 628.0, "entropy/max": 0.294921875, "entropy/mean": 0.19873046875, "entropy/min": 0.15283203125, "epoch": 0.8154158215010142, "frac_reward_zero_std": 0.0, "grad_norm": 0.14004549384117126, "learning_rate": 8.337320188286317e-08, "loss": 0.006351538002490997, "reward": 1.813425898551941, "reward_std": 0.15804100036621094, "rewards/DiagnosisAccuracyORM/mean": 0.43784722685813904, "rewards/DiagnosisAccuracyORM/std": 0.30584803223609924, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.37557873129844666, "rewards/KeyDiagnosticEvidenceORM/std": 0.10643060505390167, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/mean_length": 820.2708740234375, "completions/min_length": 606.0, "entropy/max": 0.4140625, "entropy/mean": 0.23193359375, "entropy/min": 0.133056640625, "epoch": 0.8174442190669371, "frac_reward_zero_std": 0.0, "grad_norm": 0.11563621461391449, "learning_rate": 8.160217710125661e-08, "loss": 0.0009669786086305976, "reward": 1.772462010383606, "reward_std": 0.2826528549194336, "rewards/DiagnosisAccuracyORM/mean": 0.49526286125183105, "rewards/DiagnosisAccuracyORM/std": 0.3103708326816559, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27719905972480774, "rewards/KeyDiagnosticEvidenceORM/std": 0.13746188580989838, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/mean_length": 822.4791870117188, "completions/min_length": 652.0, "entropy/max": 0.47265625, "entropy/mean": 0.23486328125, "entropy/min": 0.13232421875, "epoch": 0.8194726166328601, "frac_reward_zero_std": 0.0, "grad_norm": 0.12953250110149384, "learning_rate": 7.984849227670421e-08, "loss": -0.011569134891033173, "reward": 1.6789517402648926, "reward_std": 0.24158146977424622, "rewards/DiagnosisAccuracyORM/mean": 0.45094242691993713, "rewards/DiagnosisAccuracyORM/std": 0.28926441073417664, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.22800926864147186, "rewards/KeyDiagnosticEvidenceORM/std": 0.1348658949136734, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/mean_length": 797.4791870117188, "completions/min_length": 681.0, "entropy/max": 0.2861328125, "entropy/mean": 0.18994140625, "entropy/min": 0.13818359375, "epoch": 0.821501014198783, "frac_reward_zero_std": 0.0, "grad_norm": 0.13736416399478912, "learning_rate": 7.811222008840718e-08, "loss": 0.015093620866537094, "reward": 1.6306713819503784, "reward_std": 0.18846377730369568, "rewards/DiagnosisAccuracyORM/mean": 0.38645830750465393, "rewards/DiagnosisAccuracyORM/std": 0.2611951231956482, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24421297013759613, "rewards/KeyDiagnosticEvidenceORM/std": 0.1128641739487648, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/mean_length": 820.3958740234375, "completions/min_length": 683.0, "entropy/max": 0.3408203125, "entropy/mean": 0.22314453125, "entropy/min": 0.15966796875, "epoch": 0.8235294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 0.11808151006698608, "learning_rate": 7.639343249392255e-08, "loss": 0.006722161080688238, "reward": 1.9949073791503906, "reward_std": 0.14091326296329498, "rewards/DiagnosisAccuracyORM/mean": 0.6552083492279053, "rewards/DiagnosisAccuracyORM/std": 0.3082636296749115, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33969905972480774, "rewards/KeyDiagnosticEvidenceORM/std": 0.11913128942251205, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/mean_length": 816.4583740234375, "completions/min_length": 696.0, "entropy/max": 0.4404296875, "entropy/mean": 0.25830078125, "entropy/min": 0.166015625, "epoch": 0.8255578093306288, "frac_reward_zero_std": 0.0, "grad_norm": 0.14394505321979523, "learning_rate": 7.469220072618094e-08, "loss": 0.01075311005115509, "reward": 1.5194859504699707, "reward_std": 0.20923316478729248, "rewards/DiagnosisAccuracyORM/mean": 0.3163607716560364, "rewards/DiagnosisAccuracyORM/std": 0.24949686229228973, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.203125, "rewards/KeyDiagnosticEvidenceORM/std": 0.1399083435535431, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/mean_length": 825.3333740234375, "completions/min_length": 658.0, "entropy/max": 0.34765625, "entropy/mean": 0.2294921875, "entropy/min": 0.14794921875, "epoch": 0.8275862068965517, "frac_reward_zero_std": 0.0, "grad_norm": 0.1299569308757782, "learning_rate": 7.300859529053421e-08, "loss": 0.008218785747885704, "reward": 1.9196925163269043, "reward_std": 0.2689882218837738, "rewards/DiagnosisAccuracyORM/mean": 0.5568452477455139, "rewards/DiagnosisAccuracyORM/std": 0.2896152138710022, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.362847238779068, "rewards/KeyDiagnosticEvidenceORM/std": 0.13258832693099976, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/mean_length": 800.1875, "completions/min_length": 689.0, "entropy/max": 0.2919921875, "entropy/mean": 0.2021484375, "entropy/min": 0.127197265625, "epoch": 0.8296146044624746, "frac_reward_zero_std": 0.0, "grad_norm": 0.13266420364379883, "learning_rate": 7.13426859618338e-08, "loss": 0.013138681650161743, "reward": 1.8004629611968994, "reward_std": 0.26269492506980896, "rewards/DiagnosisAccuracyORM/mean": 0.550694465637207, "rewards/DiagnosisAccuracyORM/std": 0.3122585713863373, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24976851046085358, "rewards/KeyDiagnosticEvidenceORM/std": 0.10340256989002228, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/mean_length": 829.9791870117188, "completions/min_length": 664.0, "entropy/max": 0.2802734375, "entropy/mean": 0.2119140625, "entropy/min": 0.13916015625, "epoch": 0.8316430020283976, "frac_reward_zero_std": 0.0, "grad_norm": 0.12214292585849762, "learning_rate": 6.969454178153921e-08, "loss": -0.0013571319868788123, "reward": 1.5557210445404053, "reward_std": 0.14245611429214478, "rewards/DiagnosisAccuracyORM/mean": 0.3010912835597992, "rewards/DiagnosisAccuracyORM/std": 0.2598704993724823, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.25462964177131653, "rewards/KeyDiagnosticEvidenceORM/std": 0.0961112305521965, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/mean_length": 839.4791870117188, "completions/min_length": 691.0, "entropy/max": 0.3720703125, "entropy/mean": 0.236328125, "entropy/min": 0.1611328125, "epoch": 0.8336713995943205, "frac_reward_zero_std": 0.0, "grad_norm": 0.14376255869865417, "learning_rate": 6.806423105485576e-08, "loss": 0.0035720914602279663, "reward": 1.839748740196228, "reward_std": 0.26020103693008423, "rewards/DiagnosisAccuracyORM/mean": 0.5503968000411987, "rewards/DiagnosisAccuracyORM/std": 0.34823450446128845, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28935185074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.1361832618713379, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/mean_length": 805.1666870117188, "completions/min_length": 670.0, "entropy/max": 0.3056640625, "entropy/mean": 0.1943359375, "entropy/min": 0.1240234375, "epoch": 0.8356997971602435, "frac_reward_zero_std": 0.0, "grad_norm": 0.12499824166297913, "learning_rate": 6.645182134790467e-08, "loss": 0.002745717763900757, "reward": 1.8017692565917969, "reward_std": 0.22661232948303223, "rewards/DiagnosisAccuracyORM/mean": 0.48348212242126465, "rewards/DiagnosisAccuracyORM/std": 0.3130280077457428, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31828704476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.13265666365623474, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/mean_length": 815.875, "completions/min_length": 639.0, "entropy/max": 0.2841796875, "entropy/mean": 0.19482421875, "entropy/min": 0.1201171875, "epoch": 0.8377281947261663, "frac_reward_zero_std": 0.0, "grad_norm": 0.11846628040075302, "learning_rate": 6.485737948492237e-08, "loss": -0.008304417133331299, "reward": 1.8864749670028687, "reward_std": 0.20366327464580536, "rewards/DiagnosisAccuracyORM/mean": 0.545039713382721, "rewards/DiagnosisAccuracyORM/std": 0.22118620574474335, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34143519401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.09891024976968765, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/mean_length": 832.9166870117188, "completions/min_length": 684.0, "entropy/max": 0.2783203125, "entropy/mean": 0.2001953125, "entropy/min": 0.1376953125, "epoch": 0.8397565922920892, "frac_reward_zero_std": 0.0, "grad_norm": 0.11982335150241852, "learning_rate": 6.328097154549145e-08, "loss": -0.005597705952823162, "reward": 1.9113426208496094, "reward_std": 0.28033486008644104, "rewards/DiagnosisAccuracyORM/mean": 0.6000000238418579, "rewards/DiagnosisAccuracyORM/std": 0.2942313551902771, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31134259700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.1235579252243042, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/mean_length": 808.0833740234375, "completions/min_length": 659.0, "entropy/max": 0.318359375, "entropy/mean": 0.1923828125, "entropy/min": 0.124267578125, "epoch": 0.8417849898580122, "frac_reward_zero_std": 0.0, "grad_norm": 0.14426393806934357, "learning_rate": 6.172266286180161e-08, "loss": 9.144346222456079e-06, "reward": 1.6942131519317627, "reward_std": 0.16610953211784363, "rewards/DiagnosisAccuracyORM/mean": 0.382291704416275, "rewards/DiagnosisAccuracyORM/std": 0.26402518153190613, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31192129850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.10060226917266846, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1092.0, "completions/mean_length": 831.375, "completions/min_length": 679.0, "entropy/max": 0.435546875, "entropy/mean": 0.23779296875, "entropy/min": 0.13525390625, "epoch": 0.8438133874239351, "frac_reward_zero_std": 0.0, "grad_norm": 0.13932354748249054, "learning_rate": 6.018251801594231e-08, "loss": -0.0022426857613027096, "reward": 1.5678406953811646, "reward_std": 0.19485168159008026, "rewards/DiagnosisAccuracyORM/mean": 0.3311507999897003, "rewards/DiagnosisAccuracyORM/std": 0.2708880305290222, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.23668980598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.11488702148199081, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/mean_length": 816.4583740234375, "completions/min_length": 659.0, "entropy/max": 0.341796875, "entropy/mean": 0.21484375, "entropy/min": 0.14306640625, "epoch": 0.845841784989858, "frac_reward_zero_std": 0.0, "grad_norm": 0.13878852128982544, "learning_rate": 5.8660600837226235e-08, "loss": -0.013573339208960533, "reward": 1.8850033283233643, "reward_std": 0.3001938462257385, "rewards/DiagnosisAccuracyORM/mean": 0.5649802088737488, "rewards/DiagnosisAccuracyORM/std": 0.30898362398147583, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32002314925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.15439341962337494, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/mean_length": 809.4791870117188, "completions/min_length": 697.0, "entropy/max": 0.423828125, "entropy/mean": 0.20849609375, "entropy/min": 0.13818359375, "epoch": 0.847870182555781, "frac_reward_zero_std": 0.0, "grad_norm": 0.12789027392864227, "learning_rate": 5.715697439954431e-08, "loss": -0.017581697553396225, "reward": 1.7661060094833374, "reward_std": 0.2078212946653366, "rewards/DiagnosisAccuracyORM/mean": 0.42119860649108887, "rewards/DiagnosisAccuracyORM/std": 0.22610041499137878, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3449074327945709, "rewards/KeyDiagnosticEvidenceORM/std": 0.14929495751857758, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/mean_length": 835.5416870117188, "completions/min_length": 714.0, "entropy/max": 0.2880859375, "entropy/mean": 0.19189453125, "entropy/min": 0.1337890625, "epoch": 0.8498985801217038, "frac_reward_zero_std": 0.0, "grad_norm": 0.11202064901590347, "learning_rate": 5.567170101875074e-08, "loss": 0.0018336847424507141, "reward": 1.7395833730697632, "reward_std": 0.21346765756607056, "rewards/DiagnosisAccuracyORM/mean": 0.4427083432674408, "rewards/DiagnosisAccuracyORM/std": 0.27585363388061523, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.296875, "rewards/KeyDiagnosticEvidenceORM/std": 0.1477833241224289, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/mean_length": 818.5416870117188, "completions/min_length": 663.0, "entropy/max": 0.3056640625, "entropy/mean": 0.20703125, "entropy/min": 0.125732421875, "epoch": 0.8519269776876268, "frac_reward_zero_std": 0.0, "grad_norm": 0.1321931779384613, "learning_rate": 5.420484225008137e-08, "loss": -0.026697702705860138, "reward": 1.8051422834396362, "reward_std": 0.21568983793258667, "rewards/DiagnosisAccuracyORM/mean": 0.5163690447807312, "rewards/DiagnosisAccuracyORM/std": 0.27405616641044617, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28877314925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.07552886754274368, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/mean_length": 820.5208740234375, "completions/min_length": 686.0, "entropy/max": 0.2841796875, "entropy/mean": 0.2060546875, "entropy/min": 0.15087890625, "epoch": 0.8539553752535497, "frac_reward_zero_std": 0.0, "grad_norm": 0.13026894629001617, "learning_rate": 5.2756458885602314e-08, "loss": 0.006725875195115805, "reward": 1.7883102893829346, "reward_std": 0.25706538558006287, "rewards/DiagnosisAccuracyORM/mean": 0.5041666626930237, "rewards/DiagnosisAccuracyORM/std": 0.30115029215812683, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2841435372829437, "rewards/KeyDiagnosticEvidenceORM/std": 0.13719290494918823, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/mean_length": 821.6458740234375, "completions/min_length": 711.0, "entropy/max": 0.2880859375, "entropy/mean": 0.18359375, "entropy/min": 0.12841796875, "epoch": 0.8559837728194726, "frac_reward_zero_std": 0.0, "grad_norm": 0.12320768088102341, "learning_rate": 5.1326610951689944e-08, "loss": -0.008781547658145428, "reward": 1.796643614768982, "reward_std": 0.2556394040584564, "rewards/DiagnosisAccuracyORM/mean": 0.44305554032325745, "rewards/DiagnosisAccuracyORM/std": 0.2825465500354767, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.35358795523643494, "rewards/KeyDiagnosticEvidenceORM/std": 0.10214079171419144, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/mean_length": 811.875, "completions/min_length": 683.0, "entropy/max": 0.265625, "entropy/mean": 0.2001953125, "entropy/min": 0.142578125, "epoch": 0.8580121703853956, "frac_reward_zero_std": 0.0, "grad_norm": 0.10545063018798828, "learning_rate": 4.991535770654448e-08, "loss": 0.005879343952983618, "reward": 1.6753472089767456, "reward_std": 0.1963551938533783, "rewards/DiagnosisAccuracyORM/mean": 0.402777761220932, "rewards/DiagnosisAccuracyORM/std": 0.3293616771697998, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2725694477558136, "rewards/KeyDiagnosticEvidenceORM/std": 0.12601782381534576, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/mean_length": 810.9166870117188, "completions/min_length": 657.0, "entropy/max": 0.24755859375, "entropy/mean": 0.18017578125, "entropy/min": 0.125732421875, "epoch": 0.8600405679513184, "frac_reward_zero_std": 0.0, "grad_norm": 0.11858199536800385, "learning_rate": 4.8522757637732505e-08, "loss": 0.006483200937509537, "reward": 1.776703119277954, "reward_std": 0.2345944494009018, "rewards/DiagnosisAccuracyORM/mean": 0.39533731341362, "rewards/DiagnosisAccuracyORM/std": 0.3182052969932556, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.38136574625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.11655397713184357, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/mean_length": 811.5208740234375, "completions/min_length": 634.0, "entropy/max": 0.3466796875, "entropy/mean": 0.20556640625, "entropy/min": 0.1298828125, "epoch": 0.8620689655172413, "frac_reward_zero_std": 0.0, "grad_norm": 0.1255263239145279, "learning_rate": 4.714886845976429e-08, "loss": -0.005985265132039785, "reward": 1.704398274421692, "reward_std": 0.17161142826080322, "rewards/DiagnosisAccuracyORM/mean": 0.3895833492279053, "rewards/DiagnosisAccuracyORM/std": 0.2019006907939911, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31481480598449707, "rewards/KeyDiagnosticEvidenceORM/std": 0.11789755523204803, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/mean_length": 796.4166870117188, "completions/min_length": 669.0, "entropy/max": 0.24267578125, "entropy/mean": 0.18115234375, "entropy/min": 0.1220703125, "epoch": 0.8640973630831643, "frac_reward_zero_std": 0.0, "grad_norm": 0.1244652196764946, "learning_rate": 4.5793747111701194e-08, "loss": 0.000429786741733551, "reward": 1.8409721851348877, "reward_std": 0.24403175711631775, "rewards/DiagnosisAccuracyORM/mean": 0.49548614025115967, "rewards/DiagnosisAccuracyORM/std": 0.26042652130126953, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3454860746860504, "rewards/KeyDiagnosticEvidenceORM/std": 0.15951007604599, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/mean_length": 807.4583740234375, "completions/min_length": 675.0, "entropy/max": 0.4296875, "entropy/mean": 0.21484375, "entropy/min": 0.13330078125, "epoch": 0.8661257606490872, "frac_reward_zero_std": 0.0, "grad_norm": 0.13334016501903534, "learning_rate": 4.445744975479626e-08, "loss": -0.008540278300642967, "reward": 1.783200979232788, "reward_std": 0.21945777535438538, "rewards/DiagnosisAccuracyORM/mean": 0.4556547701358795, "rewards/DiagnosisAccuracyORM/std": 0.3320569694042206, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32754626870155334, "rewards/KeyDiagnosticEvidenceORM/std": 0.1128641813993454, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/mean_length": 810.8541870117188, "completions/min_length": 616.0, "entropy/max": 0.26416015625, "entropy/mean": 0.18408203125, "entropy/min": 0.128173828125, "epoch": 0.8681541582150102, "frac_reward_zero_std": 0.0, "grad_norm": 0.1395929902791977, "learning_rate": 4.314003177016645e-08, "loss": 0.0027937579434365034, "reward": 2.0531251430511475, "reward_std": 0.1981126368045807, "rewards/DiagnosisAccuracyORM/mean": 0.6364583969116211, "rewards/DiagnosisAccuracyORM/std": 0.22509929537773132, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.4166666567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.13129350543022156, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/mean_length": 842.0416870117188, "completions/min_length": 706.0, "entropy/max": 0.3291015625, "entropy/mean": 0.2197265625, "entropy/min": 0.13525390625, "epoch": 0.8701825557809331, "frac_reward_zero_std": 0.0, "grad_norm": 0.13655486702919006, "learning_rate": 4.1841547756497675e-08, "loss": -0.003393620252609253, "reward": 1.8157572746276855, "reward_std": 0.22389963269233704, "rewards/DiagnosisAccuracyORM/mean": 0.48878970742225647, "rewards/DiagnosisAccuracyORM/std": 0.3097952902317047, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32696759700775146, "rewards/KeyDiagnosticEvidenceORM/std": 0.1864921897649765, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/mean_length": 825.7708740234375, "completions/min_length": 662.0, "entropy/max": 0.2861328125, "entropy/mean": 0.20654296875, "entropy/min": 0.14111328125, "epoch": 0.8722109533468559, "frac_reward_zero_std": 0.0, "grad_norm": 0.11972443759441376, "learning_rate": 4.056205152778153e-08, "loss": -0.02352336049079895, "reward": 1.882986068725586, "reward_std": 0.20590293407440186, "rewards/DiagnosisAccuracyORM/mean": 0.5444444417953491, "rewards/DiagnosisAccuracyORM/std": 0.30797773599624634, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3385416567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.16112296283245087, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/mean_length": 805.0, "completions/min_length": 645.0, "entropy/max": 0.4072265625, "entropy/mean": 0.21630859375, "entropy/min": 0.15625, "epoch": 0.8742393509127789, "frac_reward_zero_std": 0.0, "grad_norm": 0.10046442598104477, "learning_rate": 3.930159611108602e-08, "loss": -0.01673947088420391, "reward": 1.6415178775787354, "reward_std": 0.22174574434757233, "rewards/DiagnosisAccuracyORM/mean": 0.36894842982292175, "rewards/DiagnosisAccuracyORM/std": 0.27262261509895325, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2725694477558136, "rewards/KeyDiagnosticEvidenceORM/std": 0.11569394916296005, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/mean_length": 813.375, "completions/min_length": 695.0, "entropy/max": 0.24462890625, "entropy/mean": 0.18798828125, "entropy/min": 0.127685546875, "epoch": 0.8762677484787018, "frac_reward_zero_std": 0.0, "grad_norm": 0.10665903240442276, "learning_rate": 3.806023374435663e-08, "loss": -0.0029766946099698544, "reward": 1.834962010383606, "reward_std": 0.17822833359241486, "rewards/DiagnosisAccuracyORM/mean": 0.5056796073913574, "rewards/DiagnosisAccuracyORM/std": 0.24478571116924286, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3292824327945709, "rewards/KeyDiagnosticEvidenceORM/std": 0.11301407963037491, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/mean_length": 817.2291870117188, "completions/min_length": 637.0, "entropy/max": 0.287109375, "entropy/mean": 0.2119140625, "entropy/min": 0.15380859375, "epoch": 0.8782961460446247, "frac_reward_zero_std": 0.0, "grad_norm": 0.1239096075296402, "learning_rate": 3.683801587425251e-08, "loss": 0.00023549734032712877, "reward": 1.5482308864593506, "reward_std": 0.16506017744541168, "rewards/DiagnosisAccuracyORM/mean": 0.2606150805950165, "rewards/DiagnosisAccuracyORM/std": 0.2612578272819519, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28761574625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.11438578367233276, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/mean_length": 827.1875, "completions/min_length": 681.0, "entropy/max": 0.271484375, "entropy/mean": 0.19482421875, "entropy/min": 0.130126953125, "epoch": 0.8803245436105477, "frac_reward_zero_std": 0.0, "grad_norm": 0.12819865345954895, "learning_rate": 3.563499315401386e-08, "loss": 0.0008506948943249881, "reward": 2.0420141220092773, "reward_std": 0.2124052196741104, "rewards/DiagnosisAccuracyORM/mean": 0.6826388835906982, "rewards/DiagnosisAccuracyORM/std": 0.266288697719574, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.359375, "rewards/KeyDiagnosticEvidenceORM/std": 0.12467534095048904, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/mean_length": 821.0, "completions/min_length": 610.0, "entropy/max": 0.359375, "entropy/mean": 0.21533203125, "entropy/min": 0.1357421875, "epoch": 0.8823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.12158717960119247, "learning_rate": 3.445121544136226e-08, "loss": -0.00017904739070218056, "reward": 1.6964948177337646, "reward_std": 0.11965170502662659, "rewards/DiagnosisAccuracyORM/mean": 0.45054563879966736, "rewards/DiagnosisAccuracyORM/std": 0.1926172971725464, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.24594907462596893, "rewards/KeyDiagnosticEvidenceORM/std": 0.10518935322761536, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/mean_length": 806.4583740234375, "completions/min_length": 680.0, "entropy/max": 0.2822265625, "entropy/mean": 0.193359375, "entropy/min": 0.1376953125, "epoch": 0.8843813387423936, "frac_reward_zero_std": 0.0, "grad_norm": 0.12952378392219543, "learning_rate": 3.328673179643554e-08, "loss": -0.0015987008810043335, "reward": 1.9059028625488281, "reward_std": 0.24115517735481262, "rewards/DiagnosisAccuracyORM/mean": 0.6072916984558105, "rewards/DiagnosisAccuracyORM/std": 0.30364036560058594, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2986111342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.11708244681358337, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/mean_length": 813.8125, "completions/min_length": 673.0, "entropy/max": 0.3466796875, "entropy/mean": 0.201171875, "entropy/min": 0.13818359375, "epoch": 0.8864097363083164, "frac_reward_zero_std": 0.0, "grad_norm": 0.12337176501750946, "learning_rate": 3.2141590479753234e-08, "loss": 0.0034095249138772488, "reward": 1.8157904148101807, "reward_std": 0.18885910511016846, "rewards/DiagnosisAccuracyORM/mean": 0.41011905670166016, "rewards/DiagnosisAccuracyORM/std": 0.2601296305656433, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.4056713283061981, "rewards/KeyDiagnosticEvidenceORM/std": 0.1539408266544342, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/mean_length": 830.0625, "completions/min_length": 668.0, "entropy/max": 0.3427734375, "entropy/mean": 0.21826171875, "entropy/min": 0.15087890625, "epoch": 0.8884381338742393, "frac_reward_zero_std": 0.0, "grad_norm": 0.12476707249879837, "learning_rate": 3.101583895021731e-08, "loss": -0.004888281226158142, "reward": 1.917708396911621, "reward_std": 0.21476486325263977, "rewards/DiagnosisAccuracyORM/mean": 0.5756944417953491, "rewards/DiagnosisAccuracyORM/std": 0.2849237620830536, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3420139253139496, "rewards/KeyDiagnosticEvidenceORM/std": 0.1305607408285141, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/mean_length": 800.3333740234375, "completions/min_length": 666.0, "entropy/max": 0.294921875, "entropy/mean": 0.21337890625, "entropy/min": 0.15087890625, "epoch": 0.8904665314401623, "frac_reward_zero_std": 0.0, "grad_norm": 0.13502554595470428, "learning_rate": 2.990952386314505e-08, "loss": -0.007126680575311184, "reward": 1.8562500476837158, "reward_std": 0.27310433983802795, "rewards/DiagnosisAccuracyORM/mean": 0.535069465637207, "rewards/DiagnosisAccuracyORM/std": 0.2678219676017761, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3211805522441864, "rewards/KeyDiagnosticEvidenceORM/std": 0.13252639770507812, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/mean_length": 821.125, "completions/min_length": 661.0, "entropy/max": 0.2822265625, "entropy/mean": 0.1875, "entropy/min": 0.133056640625, "epoch": 0.8924949290060852, "frac_reward_zero_std": 0.0, "grad_norm": 0.10413457453250885, "learning_rate": 2.8822691068335515e-08, "loss": 0.012881940230727196, "reward": 1.710400104522705, "reward_std": 0.2393883913755417, "rewards/DiagnosisAccuracyORM/mean": 0.42266860604286194, "rewards/DiagnosisAccuracyORM/std": 0.26404258608818054, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28773149847984314, "rewards/KeyDiagnosticEvidenceORM/std": 0.10459598153829575, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/mean_length": 808.5208740234375, "completions/min_length": 584.0, "entropy/max": 0.37890625, "entropy/mean": 0.2138671875, "entropy/min": 0.13427734375, "epoch": 0.8945233265720081, "frac_reward_zero_std": 0.0, "grad_norm": 0.11103492230176926, "learning_rate": 2.7755385608169368e-08, "loss": 0.007331653498113155, "reward": 1.453125, "reward_std": 0.15439516305923462, "rewards/DiagnosisAccuracyORM/mean": 0.2309027910232544, "rewards/DiagnosisAccuracyORM/std": 0.2195253223180771, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2222222238779068, "rewards/KeyDiagnosticEvidenceORM/std": 0.15354150533676147, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/mean_length": 825.375, "completions/min_length": 648.0, "entropy/max": 0.3095703125, "entropy/mean": 0.20654296875, "entropy/min": 0.14599609375, "epoch": 0.896551724137931, "frac_reward_zero_std": 0.0, "grad_norm": 0.1275726854801178, "learning_rate": 2.6707651715742074e-08, "loss": -0.011725731194019318, "reward": 1.8295221328735352, "reward_std": 0.2769942283630371, "rewards/DiagnosisAccuracyORM/mean": 0.527901828289032, "rewards/DiagnosisAccuracyORM/std": 0.33037155866622925, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30162036418914795, "rewards/KeyDiagnosticEvidenceORM/std": 0.15818430483341217, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/mean_length": 796.6458740234375, "completions/min_length": 668.0, "entropy/max": 0.2685546875, "entropy/mean": 0.19384765625, "entropy/min": 0.12890625, "epoch": 0.8985801217038539, "frac_reward_zero_std": 0.0, "grad_norm": 0.12564204633235931, "learning_rate": 2.5679532813030592e-08, "loss": 0.005330778658390045, "reward": 1.9041666984558105, "reward_std": 0.1703633964061737, "rewards/DiagnosisAccuracyORM/mean": 0.534375011920929, "rewards/DiagnosisAccuracyORM/std": 0.25772708654403687, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3697916567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.10874493420124054, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/mean_length": 810.1666870117188, "completions/min_length": 651.0, "entropy/max": 0.2607421875, "entropy/mean": 0.1943359375, "entropy/min": 0.125732421875, "epoch": 0.9006085192697769, "frac_reward_zero_std": 0.0, "grad_norm": 0.11660079658031464, "learning_rate": 2.4671071509094365e-08, "loss": 0.0006911928649060428, "reward": 1.9730324745178223, "reward_std": 0.14074793457984924, "rewards/DiagnosisAccuracyORM/mean": 0.6489583849906921, "rewards/DiagnosisAccuracyORM/std": 0.36146220564842224, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32407405972480774, "rewards/KeyDiagnosticEvidenceORM/std": 0.1214640885591507, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/mean_length": 824.2291870117188, "completions/min_length": 644.0, "entropy/max": 0.4130859375, "entropy/mean": 0.21240234375, "entropy/min": 0.1328125, "epoch": 0.9026369168356998, "frac_reward_zero_std": 0.0, "grad_norm": 0.11533091217279434, "learning_rate": 2.3682309598308746e-08, "loss": -0.01626177504658699, "reward": 1.7571427822113037, "reward_std": 0.22734877467155457, "rewards/DiagnosisAccuracyORM/mean": 0.41223546862602234, "rewards/DiagnosisAccuracyORM/std": 0.28091496229171753, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34490740299224854, "rewards/KeyDiagnosticEvidenceORM/std": 0.1467439830303192, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/mean_length": 819.375, "completions/min_length": 700.0, "entropy/max": 0.359375, "entropy/mean": 0.22021484375, "entropy/min": 0.15673828125, "epoch": 0.9046653144016227, "frac_reward_zero_std": 0.0, "grad_norm": 0.1420605331659317, "learning_rate": 2.2713288058633317e-08, "loss": 0.008946518413722515, "reward": 1.9000000953674316, "reward_std": 0.2366221845149994, "rewards/DiagnosisAccuracyORM/mean": 0.5736111402511597, "rewards/DiagnosisAccuracyORM/std": 0.30338579416275024, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3263888657093048, "rewards/KeyDiagnosticEvidenceORM/std": 0.142621248960495, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/mean_length": 802.0, "completions/min_length": 689.0, "entropy/max": 0.24365234375, "entropy/mean": 0.18408203125, "entropy/min": 0.13232421875, "epoch": 0.9066937119675457, "frac_reward_zero_std": 0.0, "grad_norm": 0.12419182807207108, "learning_rate": 2.1764047049913525e-08, "loss": 0.006442192941904068, "reward": 1.7306714057922363, "reward_std": 0.23956722021102905, "rewards/DiagnosisAccuracyORM/mean": 0.4552083909511566, "rewards/DiagnosisAccuracyORM/std": 0.2782747447490692, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.27546295523643494, "rewards/KeyDiagnosticEvidenceORM/std": 0.11371554434299469, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/mean_length": 800.7083740234375, "completions/min_length": 653.0, "entropy/max": 0.2705078125, "entropy/mean": 0.1953125, "entropy/min": 0.14111328125, "epoch": 0.9087221095334685, "frac_reward_zero_std": 0.0, "grad_norm": 0.13393822312355042, "learning_rate": 2.083462591221613e-08, "loss": -0.009520964697003365, "reward": 1.7119214534759521, "reward_std": 0.2237590253353119, "rewards/DiagnosisAccuracyORM/mean": 0.3982639014720917, "rewards/DiagnosisAccuracyORM/std": 0.33695268630981445, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.31365740299224854, "rewards/KeyDiagnosticEvidenceORM/std": 0.1168660819530487, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/mean_length": 813.2916870117188, "completions/min_length": 675.0, "entropy/max": 0.2958984375, "entropy/mean": 0.197265625, "entropy/min": 0.1298828125, "epoch": 0.9107505070993914, "frac_reward_zero_std": 0.0, "grad_norm": 0.10112005472183228, "learning_rate": 1.992506316419912e-08, "loss": -0.00446745753288269, "reward": 1.8068287372589111, "reward_std": 0.17667275667190552, "rewards/DiagnosisAccuracyORM/mean": 0.5197916626930237, "rewards/DiagnosisAccuracyORM/std": 0.28078025579452515, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28703704476356506, "rewards/KeyDiagnosticEvidenceORM/std": 0.1159316822886467, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/mean_length": 809.4375, "completions/min_length": 643.0, "entropy/max": 0.2412109375, "entropy/mean": 0.17333984375, "entropy/min": 0.1328125, "epoch": 0.9127789046653144, "frac_reward_zero_std": 0.0, "grad_norm": 0.1112411692738533, "learning_rate": 1.9035396501515144e-08, "loss": 0.004743516445159912, "reward": 1.816319465637207, "reward_std": 0.20510652661323547, "rewards/DiagnosisAccuracyORM/mean": 0.45347222685813904, "rewards/DiagnosisAccuracyORM/std": 0.3190736770629883, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3628472089767456, "rewards/KeyDiagnosticEvidenceORM/std": 0.13649307191371918, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/mean_length": 807.2916870117188, "completions/min_length": 652.0, "entropy/max": 0.4892578125, "entropy/mean": 0.23193359375, "entropy/min": 0.12890625, "epoch": 0.9148073022312373, "frac_reward_zero_std": 0.0, "grad_norm": 0.12683357298374176, "learning_rate": 1.816566279524917e-08, "loss": -0.0013251106720417738, "reward": 1.5665178298950195, "reward_std": 0.2935318350791931, "rewards/DiagnosisAccuracyORM/mean": 0.3269345164299011, "rewards/DiagnosisAccuracyORM/std": 0.2965775728225708, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2395833283662796, "rewards/KeyDiagnosticEvidenceORM/std": 0.1367596834897995, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/mean_length": 797.4583740234375, "completions/min_length": 653.0, "entropy/max": 0.275390625, "entropy/mean": 0.201171875, "entropy/min": 0.13623046875, "epoch": 0.9168356997971603, "frac_reward_zero_std": 0.0, "grad_norm": 0.12261029332876205, "learning_rate": 1.7315898090390745e-08, "loss": 0.0014655193081125617, "reward": 1.7232969999313354, "reward_std": 0.19285768270492554, "rewards/DiagnosisAccuracyORM/mean": 0.4154266119003296, "rewards/DiagnosisAccuracyORM/std": 0.19001272320747375, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30787035822868347, "rewards/KeyDiagnosticEvidenceORM/std": 0.10869617015123367, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/mean_length": 836.9583740234375, "completions/min_length": 722.0, "entropy/max": 0.447265625, "entropy/mean": 0.26220703125, "entropy/min": 0.19091796875, "epoch": 0.9188640973630832, "frac_reward_zero_std": 0.0, "grad_norm": 0.15459686517715454, "learning_rate": 1.648613760433981e-08, "loss": 0.0034728795289993286, "reward": 1.675512671470642, "reward_std": 0.19500327110290527, "rewards/DiagnosisAccuracyORM/mean": 0.372271865606308, "rewards/DiagnosisAccuracyORM/std": 0.39270204305648804, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30324074625968933, "rewards/KeyDiagnosticEvidenceORM/std": 0.13226936757564545, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/mean_length": 823.8333740234375, "completions/min_length": 693.0, "entropy/max": 0.33984375, "entropy/mean": 0.20361328125, "entropy/min": 0.1357421875, "epoch": 0.920892494929006, "frac_reward_zero_std": 0.0, "grad_norm": 0.1255842000246048, "learning_rate": 1.5676415725447423e-08, "loss": -0.00849081575870514, "reward": 1.8803075551986694, "reward_std": 0.21865227818489075, "rewards/DiagnosisAccuracyORM/mean": 0.5469741821289062, "rewards/DiagnosisAccuracyORM/std": 0.26757457852363586, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3333333432674408, "rewards/KeyDiagnosticEvidenceORM/std": 0.11630869656801224, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/mean_length": 832.2708740234375, "completions/min_length": 745.0, "entropy/max": 0.248046875, "entropy/mean": 0.1875, "entropy/min": 0.13623046875, "epoch": 0.922920892494929, "frac_reward_zero_std": 0.0, "grad_norm": 0.12153751403093338, "learning_rate": 1.4886766011590446e-08, "loss": -0.008543267846107483, "reward": 1.9293651580810547, "reward_std": 0.16165399551391602, "rewards/DiagnosisAccuracyORM/mean": 0.6151289939880371, "rewards/DiagnosisAccuracyORM/std": 0.3007258176803589, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3142361342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.1531299650669098, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/mean_length": 824.25, "completions/min_length": 694.0, "entropy/max": 0.26171875, "entropy/mean": 0.18994140625, "entropy/min": 0.1455078125, "epoch": 0.9249492900608519, "frac_reward_zero_std": 0.0, "grad_norm": 0.11192111670970917, "learning_rate": 1.4117221188780614e-08, "loss": -0.008855953812599182, "reward": 1.9585647583007812, "reward_std": 0.3445439636707306, "rewards/DiagnosisAccuracyORM/mean": 0.590624988079071, "rewards/DiagnosisAccuracyORM/std": 0.348246306180954, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3679398000240326, "rewards/KeyDiagnosticEvidenceORM/std": 0.15966063737869263, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/mean_length": 810.9166870117188, "completions/min_length": 691.0, "entropy/max": 0.2587890625, "entropy/mean": 0.20751953125, "entropy/min": 0.1328125, "epoch": 0.9269776876267748, "frac_reward_zero_std": 0.0, "grad_norm": 0.14151644706726074, "learning_rate": 1.3367813149808726e-08, "loss": 0.004046741873025894, "reward": 1.9571263790130615, "reward_std": 0.232615128159523, "rewards/DiagnosisAccuracyORM/mean": 0.6214781403541565, "rewards/DiagnosisAccuracyORM/std": 0.39463233947753906, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33564814925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.10914833843708038, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/mean_length": 831.875, "completions/min_length": 659.0, "entropy/max": 0.4462890625, "entropy/mean": 0.23046875, "entropy/min": 0.15234375, "epoch": 0.9290060851926978, "frac_reward_zero_std": 0.0, "grad_norm": 0.1524704247713089, "learning_rate": 1.2638572952922477e-08, "loss": -0.016962386667728424, "reward": 1.8931714296340942, "reward_std": 0.2446625828742981, "rewards/DiagnosisAccuracyORM/mean": 0.612500011920929, "rewards/DiagnosisAccuracyORM/std": 0.3241373300552368, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28067129850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.13441245257854462, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/mean_length": 812.5833740234375, "completions/min_length": 577.0, "entropy/max": 0.33203125, "entropy/mean": 0.20068359375, "entropy/min": 0.117431640625, "epoch": 0.9310344827586207, "frac_reward_zero_std": 0.0, "grad_norm": 0.13900771737098694, "learning_rate": 1.192953082053927e-08, "loss": -0.01467536948621273, "reward": 1.6769014596939087, "reward_std": 0.2384307086467743, "rewards/DiagnosisAccuracyORM/mean": 0.36324405670166016, "rewards/DiagnosisAccuracyORM/std": 0.2978505492210388, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3136574327945709, "rewards/KeyDiagnosticEvidenceORM/std": 0.13153813779354095, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/mean_length": 795.6666870117188, "completions/min_length": 659.0, "entropy/max": 0.234375, "entropy/mean": 0.18798828125, "entropy/min": 0.14208984375, "epoch": 0.9330628803245437, "frac_reward_zero_std": 0.0, "grad_norm": 0.12908369302749634, "learning_rate": 1.1240716137994043e-08, "loss": 0.002897439058870077, "reward": 1.6744213104248047, "reward_std": 0.19247551262378693, "rewards/DiagnosisAccuracyORM/mean": 0.38854169845581055, "rewards/DiagnosisAccuracyORM/std": 0.3015750050544739, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28587964177131653, "rewards/KeyDiagnosticEvidenceORM/std": 0.13703949749469757, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/mean_length": 832.0208740234375, "completions/min_length": 663.0, "entropy/max": 0.2919921875, "entropy/mean": 0.21923828125, "entropy/min": 0.16943359375, "epoch": 0.9350912778904665, "frac_reward_zero_std": 0.0, "grad_norm": 0.15417441725730896, "learning_rate": 1.0572157452321095e-08, "loss": 0.010436361655592918, "reward": 1.8745040893554688, "reward_std": 0.1627742052078247, "rewards/DiagnosisAccuracyORM/mean": 0.5758928656578064, "rewards/DiagnosisAccuracyORM/std": 0.3164697289466858, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2986111342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.12819337844848633, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/mean_length": 806.1458740234375, "completions/min_length": 635.0, "entropy/max": 0.3212890625, "entropy/mean": 0.20361328125, "entropy/min": 0.13818359375, "epoch": 0.9371196754563894, "frac_reward_zero_std": 0.0, "grad_norm": 0.1164170578122139, "learning_rate": 9.92388247107112e-09, "loss": -0.008474349975585938, "reward": 1.7151621580123901, "reward_std": 0.15156471729278564, "rewards/DiagnosisAccuracyORM/mean": 0.4072916507720947, "rewards/DiagnosisAccuracyORM/std": 0.24203398823738098, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30787038803100586, "rewards/KeyDiagnosticEvidenceORM/std": 0.0975523516535759, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/mean_length": 822.25, "completions/min_length": 702.0, "entropy/max": 0.4150390625, "entropy/mean": 0.2099609375, "entropy/min": 0.15087890625, "epoch": 0.9391480730223124, "frac_reward_zero_std": 0.0, "grad_norm": 0.13120482861995697, "learning_rate": 9.295918061163033e-09, "loss": -0.0006822521681897342, "reward": 1.713822841644287, "reward_std": 0.22466790676116943, "rewards/DiagnosisAccuracyORM/mean": 0.4366236925125122, "rewards/DiagnosisAccuracyORM/std": 0.22436542809009552, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2771990895271301, "rewards/KeyDiagnosticEvidenceORM/std": 0.07970624417066574, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/mean_length": 809.2291870117188, "completions/min_length": 642.0, "entropy/max": 0.25341796875, "entropy/mean": 0.19091796875, "entropy/min": 0.14013671875, "epoch": 0.9411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.12510637938976288, "learning_rate": 8.688290247770069e-09, "loss": -0.019559506326913834, "reward": 1.9208581447601318, "reward_std": 0.2506689429283142, "rewards/DiagnosisAccuracyORM/mean": 0.5649554133415222, "rewards/DiagnosisAccuracyORM/std": 0.28304487466812134, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.355902761220932, "rewards/KeyDiagnosticEvidenceORM/std": 0.12123719602823257, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/mean_length": 820.8333740234375, "completions/min_length": 695.0, "entropy/max": 0.328125, "entropy/mean": 0.2080078125, "entropy/min": 0.14892578125, "epoch": 0.9432048681541582, "frac_reward_zero_std": 0.0, "grad_norm": 0.12596479058265686, "learning_rate": 8.101024213241825e-09, "loss": 0.0015766248106956482, "reward": 2.019791841506958, "reward_std": 0.2556682229042053, "rewards/DiagnosisAccuracyORM/mean": 0.6916666626930237, "rewards/DiagnosisAccuracyORM/std": 0.2855876088142395, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.328125, "rewards/KeyDiagnosticEvidenceORM/std": 0.20085321366786957, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/mean_length": 817.25, "completions/min_length": 692.0, "entropy/max": 0.2705078125, "entropy/mean": 0.19287109375, "entropy/min": 0.13427734375, "epoch": 0.9452332657200812, "frac_reward_zero_std": 0.0, "grad_norm": 0.12421958893537521, "learning_rate": 7.534144296060142e-09, "loss": -0.0035328667145222425, "reward": 1.789583444595337, "reward_std": 0.20973145961761475, "rewards/DiagnosisAccuracyORM/mean": 0.47534725069999695, "rewards/DiagnosisAccuracyORM/std": 0.2840934693813324, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3142361342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.11376214772462845, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/mean_length": 825.3541870117188, "completions/min_length": 658.0, "entropy/max": 0.2626953125, "entropy/mean": 0.21337890625, "entropy/min": 0.16162109375, "epoch": 0.947261663286004, "frac_reward_zero_std": 0.0, "grad_norm": 0.14131459593772888, "learning_rate": 6.987673989830522e-09, "loss": 0.01903238520026207, "reward": 1.7284889221191406, "reward_std": 0.19814343750476837, "rewards/DiagnosisAccuracyORM/mean": 0.43219247460365295, "rewards/DiagnosisAccuracyORM/std": 0.24976180493831635, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29629629850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.14623966813087463, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/mean_length": 833.5625, "completions/min_length": 691.0, "entropy/max": 0.337890625, "entropy/mean": 0.21240234375, "entropy/min": 0.15576171875, "epoch": 0.949290060851927, "frac_reward_zero_std": 0.0, "grad_norm": 0.1250298023223877, "learning_rate": 6.4616359423086406e-09, "loss": -0.0034934580326080322, "reward": 1.7686922550201416, "reward_std": 0.26173311471939087, "rewards/DiagnosisAccuracyORM/mean": 0.4799189865589142, "rewards/DiagnosisAccuracyORM/std": 0.31964045763015747, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28877314925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.1485128551721573, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/mean_length": 819.3541870117188, "completions/min_length": 618.0, "entropy/max": 0.3310546875, "entropy/mean": 0.21240234375, "entropy/min": 0.12646484375, "epoch": 0.9513184584178499, "frac_reward_zero_std": 0.0, "grad_norm": 0.12821908295154572, "learning_rate": 5.956051954461472e-09, "loss": 0.0014817392220720649, "reward": 1.8485450744628906, "reward_std": 0.21263229846954346, "rewards/DiagnosisAccuracyORM/mean": 0.4851190745830536, "rewards/DiagnosisAccuracyORM/std": 0.25955772399902344, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.36342594027519226, "rewards/KeyDiagnosticEvidenceORM/std": 0.1890794187784195, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/mean_length": 816.375, "completions/min_length": 668.0, "entropy/max": 0.2919921875, "entropy/mean": 0.201171875, "entropy/min": 0.133056640625, "epoch": 0.9533468559837728, "frac_reward_zero_std": 0.0, "grad_norm": 0.12598957121372223, "learning_rate": 5.47094297956402e-09, "loss": 0.010602787137031555, "reward": 1.782060146331787, "reward_std": 0.22873413562774658, "rewards/DiagnosisAccuracyORM/mean": 0.43020835518836975, "rewards/DiagnosisAccuracyORM/std": 0.23185083270072937, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.35185185074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.17986851930618286, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/mean_length": 823.1458740234375, "completions/min_length": 687.0, "entropy/max": 0.31640625, "entropy/mean": 0.20166015625, "entropy/min": 0.1376953125, "epoch": 0.9553752535496958, "frac_reward_zero_std": 0.0, "grad_norm": 0.13657647371292114, "learning_rate": 5.006329122330899e-09, "loss": -0.0013448446989059448, "reward": 1.867708444595337, "reward_std": 0.22337917983531952, "rewards/DiagnosisAccuracyORM/mean": 0.5065972208976746, "rewards/DiagnosisAccuracyORM/std": 0.28682124614715576, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3611110746860504, "rewards/KeyDiagnosticEvidenceORM/std": 0.13835595548152924, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/mean_length": 803.3125, "completions/min_length": 640.0, "entropy/max": 0.337890625, "entropy/mean": 0.2001953125, "entropy/min": 0.1513671875, "epoch": 0.9574036511156186, "frac_reward_zero_std": 0.0, "grad_norm": 0.13339203596115112, "learning_rate": 4.562229638082893e-09, "loss": -0.004424243234097958, "reward": 1.7494213581085205, "reward_std": 0.22179186344146729, "rewards/DiagnosisAccuracyORM/mean": 0.4496527910232544, "rewards/DiagnosisAccuracyORM/std": 0.28687790036201477, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2997685372829437, "rewards/KeyDiagnosticEvidenceORM/std": 0.10169278085231781, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/mean_length": 830.9375, "completions/min_length": 715.0, "entropy/max": 0.3447265625, "entropy/mean": 0.2119140625, "entropy/min": 0.13818359375, "epoch": 0.9594320486815415, "frac_reward_zero_std": 0.0, "grad_norm": 0.12550371885299683, "learning_rate": 4.138662931949255e-09, "loss": 0.002487321849912405, "reward": 1.8473875522613525, "reward_std": 0.217189759016037, "rewards/DiagnosisAccuracyORM/mean": 0.5146329402923584, "rewards/DiagnosisAccuracyORM/std": 0.313353955745697, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.33275464177131653, "rewards/KeyDiagnosticEvidenceORM/std": 0.14636941254138947, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/mean_length": 796.6666870117188, "completions/min_length": 655.0, "entropy/max": 0.26806640625, "entropy/mean": 0.1943359375, "entropy/min": 0.131103515625, "epoch": 0.9614604462474645, "frac_reward_zero_std": 0.0, "grad_norm": 0.13228075206279755, "learning_rate": 3.73564655810471e-09, "loss": -0.0017241587629541755, "reward": 1.8791667222976685, "reward_std": 0.22190868854522705, "rewards/DiagnosisAccuracyORM/mean": 0.5180555582046509, "rewards/DiagnosisAccuracyORM/std": 0.2981787621974945, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3611110746860504, "rewards/KeyDiagnosticEvidenceORM/std": 0.1530059576034546, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/mean_length": 813.8541870117188, "completions/min_length": 630.0, "entropy/max": 0.345703125, "entropy/mean": 0.201171875, "entropy/min": 0.129150390625, "epoch": 0.9634888438133874, "frac_reward_zero_std": 0.0, "grad_norm": 0.12145263701677322, "learning_rate": 3.353197219041981e-09, "loss": 0.01952715776860714, "reward": 1.9041088819503784, "reward_std": 0.2518158555030823, "rewards/DiagnosisAccuracyORM/mean": 0.6078124642372131, "rewards/DiagnosisAccuracyORM/std": 0.27800267934799194, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.29629629850387573, "rewards/KeyDiagnosticEvidenceORM/std": 0.08790332078933716, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/mean_length": 837.4166870117188, "completions/min_length": 690.0, "entropy/max": 0.373046875, "entropy/mean": 0.2197265625, "entropy/min": 0.16162109375, "epoch": 0.9655172413793104, "frac_reward_zero_std": 0.0, "grad_norm": 0.14637444913387299, "learning_rate": 2.9913307648797294e-09, "loss": 0.0023676257114857435, "reward": 1.790343999862671, "reward_std": 0.27263784408569336, "rewards/DiagnosisAccuracyORM/mean": 0.4332837164402008, "rewards/DiagnosisAccuracyORM/std": 0.29857271909713745, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.35706019401550293, "rewards/KeyDiagnosticEvidenceORM/std": 0.10202017426490784, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/mean_length": 818.8541870117188, "completions/min_length": 668.0, "entropy/max": 0.3154296875, "entropy/mean": 0.21533203125, "entropy/min": 0.150390625, "epoch": 0.9675456389452333, "frac_reward_zero_std": 0.0, "grad_norm": 0.13102437555789948, "learning_rate": 2.650062192705471e-09, "loss": -0.010460738092660904, "reward": 1.9518849849700928, "reward_std": 0.24449093639850616, "rewards/DiagnosisAccuracyORM/mean": 0.6133432388305664, "rewards/DiagnosisAccuracyORM/std": 0.36505943536758423, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3385416567325592, "rewards/KeyDiagnosticEvidenceORM/std": 0.13049785792827606, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/mean_length": 813.2291870117188, "completions/min_length": 649.0, "entropy/max": 0.369140625, "entropy/mean": 0.205078125, "entropy/min": 0.158203125, "epoch": 0.9695740365111561, "frac_reward_zero_std": 0.0, "grad_norm": 0.10575313866138458, "learning_rate": 2.32940564595413e-09, "loss": -0.003468262730166316, "reward": 1.7716436386108398, "reward_std": 0.2452157437801361, "rewards/DiagnosisAccuracyORM/mean": 0.44236111640930176, "rewards/DiagnosisAccuracyORM/std": 0.3457605242729187, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3292824327945709, "rewards/KeyDiagnosticEvidenceORM/std": 0.15181995928287506, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/mean_length": 813.5833740234375, "completions/min_length": 703.0, "entropy/max": 0.3046875, "entropy/mean": 0.19775390625, "entropy/min": 0.1416015625, "epoch": 0.9716024340770791, "frac_reward_zero_std": 0.0, "grad_norm": 0.10822001099586487, "learning_rate": 2.029374413821949e-09, "loss": -0.0019408862572163343, "reward": 1.8000826835632324, "reward_std": 0.1466173231601715, "rewards/DiagnosisAccuracyORM/mean": 0.529365062713623, "rewards/DiagnosisAccuracyORM/std": 0.2635250985622406, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2707176208496094, "rewards/KeyDiagnosticEvidenceORM/std": 0.1099390909075737, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/mean_length": 805.375, "completions/min_length": 691.0, "entropy/max": 0.3876953125, "entropy/mean": 0.20654296875, "entropy/min": 0.1298828125, "epoch": 0.973630831643002, "frac_reward_zero_std": 0.0, "grad_norm": 0.15035660564899445, "learning_rate": 1.7499809307154889e-09, "loss": -0.00011911988258361816, "reward": 1.7073743343353271, "reward_std": 0.23031863570213318, "rewards/DiagnosisAccuracyORM/mean": 0.3995039761066437, "rewards/DiagnosisAccuracyORM/std": 0.27297502756118774, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.30787035822868347, "rewards/KeyDiagnosticEvidenceORM/std": 0.15606902539730072, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/mean_length": 808.1666870117188, "completions/min_length": 677.0, "entropy/max": 0.2822265625, "entropy/mean": 0.212890625, "entropy/min": 0.1591796875, "epoch": 0.9756592292089249, "frac_reward_zero_std": 0.0, "grad_norm": 0.12042324244976044, "learning_rate": 1.4912367757366485e-09, "loss": -0.028392067179083824, "reward": 1.826802372932434, "reward_std": 0.19709989428520203, "rewards/DiagnosisAccuracyORM/mean": 0.5523809790611267, "rewards/DiagnosisAccuracyORM/std": 0.2968478202819824, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2744213044643402, "rewards/KeyDiagnosticEvidenceORM/std": 0.146165132522583, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/mean_length": 811.5416870117188, "completions/min_length": 663.0, "entropy/max": 0.3486328125, "entropy/mean": 0.21826171875, "entropy/min": 0.138671875, "epoch": 0.9776876267748479, "frac_reward_zero_std": 0.0, "grad_norm": 0.1354949027299881, "learning_rate": 1.2531526722026065e-09, "loss": 0.0039505138993263245, "reward": 1.6391451358795166, "reward_std": 0.1735014021396637, "rewards/DiagnosisAccuracyORM/mean": 0.3416914641857147, "rewards/DiagnosisAccuracyORM/std": 0.2867167890071869, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2974536716938019, "rewards/KeyDiagnosticEvidenceORM/std": 0.1216835156083107, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/mean_length": 793.3958740234375, "completions/min_length": 698.0, "entropy/max": 0.255859375, "entropy/mean": 0.18701171875, "entropy/min": 0.1416015625, "epoch": 0.9797160243407708, "frac_reward_zero_std": 0.0, "grad_norm": 0.13117168843746185, "learning_rate": 1.0357384872011766e-09, "loss": -0.0046147508546710014, "reward": 1.79135262966156, "reward_std": 0.24719877541065216, "rewards/DiagnosisAccuracyORM/mean": 0.4886905252933502, "rewards/DiagnosisAccuracyORM/std": 0.2513650357723236, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3026620149612427, "rewards/KeyDiagnosticEvidenceORM/std": 0.16118238866329193, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/mean_length": 816.7916870117188, "completions/min_length": 714.0, "entropy/max": 0.396484375, "entropy/mean": 0.224609375, "entropy/min": 0.14453125, "epoch": 0.9817444219066938, "frac_reward_zero_std": 0.0, "grad_norm": 0.14091558754444122, "learning_rate": 8.390032311824114e-10, "loss": 0.004261404275894165, "reward": 1.7699404954910278, "reward_std": 0.2303544580936432, "rewards/DiagnosisAccuracyORM/mean": 0.44181549549102783, "rewards/DiagnosisAccuracyORM/std": 0.23553584516048431, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3281250298023224, "rewards/KeyDiagnosticEvidenceORM/std": 0.1485036462545395, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/mean_length": 807.1875, "completions/min_length": 699.0, "entropy/max": 0.283203125, "entropy/mean": 0.21484375, "entropy/min": 0.16845703125, "epoch": 0.9837728194726166, "frac_reward_zero_std": 0.0, "grad_norm": 0.12261134386062622, "learning_rate": 6.629550575847354e-10, "loss": -0.013191534206271172, "reward": 1.8531746864318848, "reward_std": 0.22889763116836548, "rewards/DiagnosisAccuracyORM/mean": 0.5233134627342224, "rewards/DiagnosisAccuracyORM/std": 0.22988055646419525, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3298611342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.11075963824987411, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1079.0, "completions/mean_length": 816.1666870117188, "completions/min_length": 712.0, "entropy/max": 0.2978515625, "entropy/mean": 0.2109375, "entropy/min": 0.1630859375, "epoch": 0.9858012170385395, "frac_reward_zero_std": 0.0, "grad_norm": 0.11889815330505371, "learning_rate": 5.076012624971593e-10, "loss": -0.017505068331956863, "reward": 1.9503142833709717, "reward_std": 0.22898446023464203, "rewards/DiagnosisAccuracyORM/mean": 0.6071428656578064, "rewards/DiagnosisAccuracyORM/std": 0.31483709812164307, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.34317126870155334, "rewards/KeyDiagnosticEvidenceORM/std": 0.10385426878929138, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/mean_length": 803.5208740234375, "completions/min_length": 689.0, "entropy/max": 0.3232421875, "entropy/mean": 0.2197265625, "entropy/min": 0.15185546875, "epoch": 0.9878296146044625, "frac_reward_zero_std": 0.0, "grad_norm": 0.13290338218212128, "learning_rate": 3.7294828435696644e-10, "loss": 0.0010830312967300415, "reward": 1.9644676446914673, "reward_std": 0.245771586894989, "rewards/DiagnosisAccuracyORM/mean": 0.574999988079071, "rewards/DiagnosisAccuracyORM/std": 0.2716946005821228, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3894675672054291, "rewards/KeyDiagnosticEvidenceORM/std": 0.14296495914459229, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/mean_length": 817.4791870117188, "completions/min_length": 678.0, "entropy/max": 0.412109375, "entropy/mean": 0.20654296875, "entropy/min": 0.12548828125, "epoch": 0.9898580121703854, "frac_reward_zero_std": 0.0, "grad_norm": 0.13227254152297974, "learning_rate": 2.5900170368281515e-10, "loss": -0.006192654371261597, "reward": 1.708449125289917, "reward_std": 0.1554088145494461, "rewards/DiagnosisAccuracyORM/mean": 0.3878472149372101, "rewards/DiagnosisAccuracyORM/std": 0.2643905580043793, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32060185074806213, "rewards/KeyDiagnosticEvidenceORM/std": 0.1140219047665596, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/mean_length": 814.9166870117188, "completions/min_length": 666.0, "entropy/max": 0.23974609375, "entropy/mean": 0.193359375, "entropy/min": 0.1513671875, "epoch": 0.9918864097363083, "frac_reward_zero_std": 0.0, "grad_norm": 0.1419660747051239, "learning_rate": 1.6576624284347917e-10, "loss": 0.0006971781840547919, "reward": 1.8434110879898071, "reward_std": 0.20030272006988525, "rewards/DiagnosisAccuracyORM/mean": 0.5233879089355469, "rewards/DiagnosisAccuracyORM/std": 0.3339756727218628, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.32002317905426025, "rewards/KeyDiagnosticEvidenceORM/std": 0.14336630702018738, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/mean_length": 830.8958740234375, "completions/min_length": 687.0, "entropy/max": 0.244140625, "entropy/mean": 0.18408203125, "entropy/min": 0.13623046875, "epoch": 0.9939148073022313, "frac_reward_zero_std": 0.0, "grad_norm": 0.12522664666175842, "learning_rate": 9.324576586211552e-11, "loss": -0.004237162880599499, "reward": 1.6679315567016602, "reward_std": 0.13819357752799988, "rewards/DiagnosisAccuracyORM/mean": 0.40057042241096497, "rewards/DiagnosisAccuracyORM/std": 0.3324569761753082, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.2673611342906952, "rewards/KeyDiagnosticEvidenceORM/std": 0.10896645486354828, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/mean_length": 803.1041870117188, "completions/min_length": 651.0, "entropy/max": 0.2880859375, "entropy/mean": 0.20263671875, "entropy/min": 0.14013671875, "epoch": 0.9959432048681541, "frac_reward_zero_std": 0.0, "grad_norm": 0.10719513893127441, "learning_rate": 4.144327825617022e-11, "loss": -0.0028557083569467068, "reward": 1.9437005519866943, "reward_std": 0.2250468134880066, "rewards/DiagnosisAccuracyORM/mean": 0.5891864895820618, "rewards/DiagnosisAccuracyORM/std": 0.3197058439254761, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.35451388359069824, "rewards/KeyDiagnosticEvidenceORM/std": 0.19412894546985626, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/mean_length": 825.4583740234375, "completions/min_length": 701.0, "entropy/max": 0.3408203125, "entropy/mean": 0.20458984375, "entropy/min": 0.127685546875, "epoch": 0.9979716024340771, "frac_reward_zero_std": 0.0, "grad_norm": 0.10682850331068039, "learning_rate": 1.0360926912866829e-11, "loss": 0.014303384348750114, "reward": 1.7630786895751953, "reward_std": 0.22181496024131775, "rewards/DiagnosisAccuracyORM/mean": 0.47430554032325745, "rewards/DiagnosisAccuracyORM/std": 0.3159652352333069, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.28877314925193787, "rewards/KeyDiagnosticEvidenceORM/std": 0.09270395338535309, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/mean_length": 807.6666870117188, "completions/min_length": 703.0, "entropy/max": 0.279296875, "entropy/mean": 0.19287109375, "entropy/min": 0.131103515625, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.12767314910888672, "learning_rate": 0.0, "loss": -0.0040830038487911224, "reward": 1.8710565567016602, "reward_std": 0.22905127704143524, "rewards/DiagnosisAccuracyORM/mean": 0.5568204522132874, "rewards/DiagnosisAccuracyORM/std": 0.3079776465892792, "rewards/FormatRewardORM/mean": 1.0, "rewards/FormatRewardORM/std": 0.0, "rewards/KeyDiagnosticEvidenceORM/mean": 0.3142361044883728, "rewards/KeyDiagnosticEvidenceORM/std": 0.13501162827014923, "step": 493 } ], "logging_steps": 1, "max_steps": 493, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }