{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4452359750667854, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.375, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 10.375, "completions/mean_terminated_length": 10.375, "completions/min_length": 3.5, "completions/min_terminated_length": 3.5, "epoch": 0.0008904719501335708, "frac_reward_zero_std": 0.75, "grad_norm": 3.243957996368408, "kl": 4.46301195025444, "learning_rate": 2.2222222222222224e-08, "loss": 0.08426375687122345, "num_tokens": 18262.0, "reward": 0.21250001154839993, "reward_std": 0.37748774141073227, "rewards/reward_financial_reasoning/mean": 0.21250001154839993, "rewards/reward_financial_reasoning/std": 0.37748774141073227, "step": 2, "step_time": 69.13269069449962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 28.5, "completions/max_terminated_length": 28.5, "completions/mean_length": 19.8125, "completions/mean_terminated_length": 19.8125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0017809439002671415, "frac_reward_zero_std": 0.5, "grad_norm": 170.77232360839844, "kl": 3.1398727893829346, "learning_rate": 6.666666666666668e-08, "loss": 0.12181135267019272, "num_tokens": 31739.0, "reward": 0.050000001676380634, "reward_std": 0.2702740728855133, "rewards/reward_financial_reasoning/mean": 0.050000001676380634, "rewards/reward_financial_reasoning/std": 0.2702740877866745, "step": 4, "step_time": 15.650146482497803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0026714158504007124, "frac_reward_zero_std": 1.0, "grad_norm": 1.4621564149856567, "kl": 4.303169131278992, "learning_rate": 1.1111111111111112e-07, "loss": 0.1490020453929901, "num_tokens": 48159.0, "reward": -0.07500000484287739, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": -0.07500000484287739, "rewards/reward_financial_reasoning/std": 0.18708287924528122, "step": 6, "step_time": 16.41489795200323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.125, "completions/clipped_ratio": 0.0, "completions/max_length": 9.5, "completions/max_terminated_length": 9.5, "completions/mean_length": 6.125, "completions/mean_terminated_length": 6.125, "completions/min_length": 3.5, "completions/min_terminated_length": 3.5, "epoch": 0.003561887800534283, "frac_reward_zero_std": 0.75, "grad_norm": 3.4477624893188477, "kl": 4.426726162433624, "learning_rate": 1.5555555555555556e-07, "loss": 0.1457902044057846, "num_tokens": 62193.0, "reward": 0.3125, "reward_std": 0.6411882638931274, "rewards/reward_financial_reasoning/mean": 0.3125, "rewards/reward_financial_reasoning/std": 0.6411882936954498, "step": 8, "step_time": 10.300071641000613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 23.5, "completions/max_terminated_length": 23.5, "completions/mean_length": 14.8125, "completions/mean_terminated_length": 14.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.004452359750667854, "frac_reward_zero_std": 0.75, "grad_norm": 18.143714904785156, "kl": 8.868210554122925, "learning_rate": 2.0000000000000002e-07, "loss": 0.10136213153600693, "num_tokens": 79262.0, "reward": -0.15000000037252903, "reward_std": 0.19667484611272812, "rewards/reward_financial_reasoning/mean": -0.15000000037252903, "rewards/reward_financial_reasoning/std": 0.19667484611272812, "step": 10, "step_time": 15.87173518450254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 32.5, "completions/max_terminated_length": 32.5, "completions/mean_length": 16.3125, "completions/mean_terminated_length": 16.3125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.005342831700801425, "frac_reward_zero_std": 0.5, "grad_norm": 3.6053011417388916, "kl": 3.103500008583069, "learning_rate": 2.444444444444445e-07, "loss": 0.04430732876062393, "num_tokens": 96275.0, "reward": -0.049999999813735485, "reward_std": 0.23070836067199707, "rewards/reward_financial_reasoning/mean": -0.049999999813735485, "rewards/reward_financial_reasoning/std": 0.23070836067199707, "step": 12, "step_time": 18.03825327550112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.006233303650934996, "frac_reward_zero_std": 0.75, "grad_norm": 0.26595669984817505, "kl": 4.701120108366013, "learning_rate": 2.888888888888889e-07, "loss": 0.08473779261112213, "num_tokens": 105843.0, "reward": 0.48749998956918716, "reward_std": 0.2176603004336357, "rewards/reward_financial_reasoning/mean": 0.48749998956918716, "rewards/reward_financial_reasoning/std": 0.2176603153347969, "step": 14, "step_time": 26.971775608002645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 4.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.4375, "completions/mean_terminated_length": 4.4375, "completions/min_length": 1.5, "completions/min_terminated_length": 1.5, "epoch": 0.007123775601068566, "frac_reward_zero_std": 1.0, "grad_norm": 0.7614744901657104, "kl": 5.345696994656464, "learning_rate": 3.3333333333333335e-07, "loss": 0.19742266833782196, "num_tokens": 120834.0, "reward": 0.2750000059604645, "reward_std": 0.6681531071662903, "rewards/reward_financial_reasoning/mean": 0.2750000059604645, "rewards/reward_financial_reasoning/std": 0.6681531071662903, "step": 16, "step_time": 10.608633042498695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.008014247551202136, "frac_reward_zero_std": 1.0, "grad_norm": 1.597021460533142, "kl": 5.759613037109375, "learning_rate": 3.777777777777778e-07, "loss": 0.20153652131557465, "num_tokens": 143318.0, "reward": -0.375, "reward_std": 0.13363061845302582, "rewards/reward_financial_reasoning/mean": -0.375, "rewards/reward_financial_reasoning/std": 0.13363061845302582, "step": 18, "step_time": 18.688278918496508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.3125, "completions/clipped_ratio": 0.125, "completions/max_length": 136.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 44.3125, "completions/mean_terminated_length": 14.895833969116211, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.008904719501335707, "frac_reward_zero_std": 1.0, "grad_norm": 8.555985450744629, "kl": 6.408893913030624, "learning_rate": 4.2222222222222226e-07, "loss": 0.1307135373353958, "num_tokens": 164947.0, "reward": -0.12500000186264515, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": -0.12500000186264515, "rewards/reward_financial_reasoning/std": 0.24053511768579483, "step": 20, "step_time": 48.735242737997396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 23.6875, "completions/mean_terminated_length": 23.6875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.009795191451469279, "frac_reward_zero_std": 0.75, "grad_norm": 2.6187081336975098, "kl": 3.0680589228868484, "learning_rate": 4.666666666666667e-07, "loss": -0.0021132836118340492, "num_tokens": 180622.0, "reward": -0.10000000149011612, "reward_std": 0.24984834343194962, "rewards/reward_financial_reasoning/mean": -0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.2498483583331108, "step": 22, "step_time": 19.969555340499937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.375, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 11.375, "completions/mean_terminated_length": 11.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.01068566340160285, "frac_reward_zero_std": 0.5, "grad_norm": 7.780721187591553, "kl": 3.334374338388443, "learning_rate": 5.111111111111112e-07, "loss": -0.036053549498319626, "num_tokens": 199364.0, "reward": 0.01250000111758709, "reward_std": 0.30308106541633606, "rewards/reward_financial_reasoning/mean": 0.01250000111758709, "rewards/reward_financial_reasoning/std": 0.30308108031749725, "step": 24, "step_time": 16.343573131500307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.375, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 21.375, "completions/mean_terminated_length": 21.375, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.01157613535173642, "frac_reward_zero_std": 0.75, "grad_norm": 15.003988265991211, "kl": 3.3079543113708496, "learning_rate": 5.555555555555555e-07, "loss": 0.024946369230747223, "num_tokens": 217602.0, "reward": 0.03749999962747097, "reward_std": 0.11877349019050598, "rewards/reward_financial_reasoning/mean": 0.03749999962747097, "rewards/reward_financial_reasoning/std": 0.11877349019050598, "step": 26, "step_time": 23.70613613250316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.375, "completions/clipped_ratio": 0.0, "completions/max_length": 23.5, "completions/max_terminated_length": 23.5, "completions/mean_length": 15.375, "completions/mean_terminated_length": 15.375, "completions/min_length": 4.5, "completions/min_terminated_length": 4.5, "epoch": 0.012466607301869992, "frac_reward_zero_std": 0.75, "grad_norm": 8.009876251220703, "kl": 4.639135271310806, "learning_rate": 6.000000000000001e-07, "loss": 0.03613180294632912, "num_tokens": 236040.0, "reward": 0.25000000558793545, "reward_std": 0.3794081211090088, "rewards/reward_financial_reasoning/mean": 0.25000000558793545, "rewards/reward_financial_reasoning/std": 0.37940813601017, "step": 28, "step_time": 16.66078308500437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 7.0, "completions/min_length": 1.5, "completions/min_terminated_length": 1.5, "epoch": 0.013357079252003561, "frac_reward_zero_std": 1.0, "grad_norm": 2.2118546962738037, "kl": 6.419469177722931, "learning_rate": 6.444444444444445e-07, "loss": 0.22115381062030792, "num_tokens": 248568.0, "reward": 0.025000005960464478, "reward_std": 0.40089186280965805, "rewards/reward_financial_reasoning/mean": 0.025000005960464478, "rewards/reward_financial_reasoning/std": 0.40089186280965805, "step": 30, "step_time": 11.650034055499418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.014247551202137132, "frac_reward_zero_std": 1.0, "grad_norm": 1.1544721126556396, "kl": 3.581442207098007, "learning_rate": 6.88888888888889e-07, "loss": 0.13401712477207184, "num_tokens": 263136.0, "reward": -0.07500000484287739, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": -0.07500000484287739, "rewards/reward_financial_reasoning/std": 0.18708287924528122, "step": 32, "step_time": 13.175670107999395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 10.5, "completions/max_terminated_length": 10.5, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.015138023152270703, "frac_reward_zero_std": 1.0, "grad_norm": 0.14551043510437012, "kl": 4.411951839923859, "learning_rate": 7.333333333333334e-07, "loss": 0.17104172706604004, "num_tokens": 284920.0, "reward": -0.2749999985098839, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": -0.2749999985098839, "rewards/reward_financial_reasoning/std": 0.24053511768579483, "step": 34, "step_time": 15.403023695998854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 63.1875, "completions/clipped_ratio": 0.1875, "completions/max_length": 140.5, "completions/max_terminated_length": 30.0, "completions/mean_length": 63.1875, "completions/mean_terminated_length": 20.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.016028495102404273, "frac_reward_zero_std": 1.0, "grad_norm": 0.961294949054718, "kl": 2.866768404841423, "learning_rate": 7.777777777777779e-07, "loss": 0.09989761561155319, "num_tokens": 300419.0, "reward": 0.12500000558793545, "reward_std": 0.34743960946798325, "rewards/reward_financial_reasoning/mean": 0.12500000558793545, "rewards/reward_financial_reasoning/std": 0.34743960946798325, "step": 36, "step_time": 45.11628793899581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.016918967052537846, "frac_reward_zero_std": 0.5, "grad_norm": 2.7937123775482178, "kl": 3.6248140931129456, "learning_rate": 8.222222222222223e-07, "loss": -0.07162602245807648, "num_tokens": 314263.0, "reward": -0.05000000074505806, "reward_std": 0.2777460217475891, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.2777460217475891, "step": 38, "step_time": 13.610250827498021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.125, "completions/clipped_ratio": 0.0625, "completions/max_length": 152.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 25.125, "completions/mean_terminated_length": 9.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.017809439002671415, "frac_reward_zero_std": 1.0, "grad_norm": 0.8069584369659424, "kl": 4.78283154964447, "learning_rate": 8.666666666666668e-07, "loss": 0.15486478805541992, "num_tokens": 336865.0, "reward": -0.2749999985098839, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": -0.2749999985098839, "rewards/reward_financial_reasoning/std": 0.24053511768579483, "step": 40, "step_time": 54.72188140000435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.5, "completions/max_terminated_length": 32.5, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.018699910952804988, "frac_reward_zero_std": 0.75, "grad_norm": 4.385245323181152, "kl": 2.647599071264267, "learning_rate": 9.111111111111113e-07, "loss": 0.011303652077913284, "num_tokens": 356229.0, "reward": -0.03750000149011612, "reward_std": 0.1989518627524376, "rewards/reward_financial_reasoning/mean": -0.03750000149011612, "rewards/reward_financial_reasoning/std": 0.1989518627524376, "step": 42, "step_time": 19.585943939000572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 21.1875, "completions/mean_terminated_length": 21.1875, "completions/min_length": 1.5, "completions/min_terminated_length": 1.5, "epoch": 0.019590382902938557, "frac_reward_zero_std": 0.5, "grad_norm": 1.2599256038665771, "kl": 2.3717754259705544, "learning_rate": 9.555555555555556e-07, "loss": -0.02832583151757717, "num_tokens": 371504.0, "reward": -0.02500000037252903, "reward_std": 0.3867208957672119, "rewards/reward_financial_reasoning/mean": -0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.3867208957672119, "step": 44, "step_time": 26.471768608502316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 136.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 11.785714626312256, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.020480854853072127, "frac_reward_zero_std": 0.75, "grad_norm": 14.519295692443848, "kl": 3.7309726029634476, "learning_rate": 1.0000000000000002e-06, "loss": 0.22456848621368408, "num_tokens": 383456.0, "reward": 0.5375000089406967, "reward_std": 0.49749018251895905, "rewards/reward_financial_reasoning/mean": 0.5375000089406967, "rewards/reward_financial_reasoning/std": 0.49749018251895905, "step": 46, "step_time": 41.24796561349831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 54.1875, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 130.5, "completions/mean_length": 54.1875, "completions/mean_terminated_length": 22.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0213713268032057, "frac_reward_zero_std": 1.0, "grad_norm": 0.7381348013877869, "kl": 4.320437252521515, "learning_rate": 1.0444444444444445e-06, "loss": 0.11993282288312912, "num_tokens": 403243.0, "reward": -0.1999999973922968, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.1999999973922968, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 48, "step_time": 79.4801567964987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.625, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 13.625, "completions/mean_terminated_length": 13.625, "completions/min_length": 1.5, "completions/min_terminated_length": 1.5, "epoch": 0.02226179875333927, "frac_reward_zero_std": 0.75, "grad_norm": 0.1954340785741806, "kl": 2.8072616159915924, "learning_rate": 1.0888888888888889e-06, "loss": 0.022382210940122604, "num_tokens": 420445.0, "reward": -0.22500000335276127, "reward_std": 0.19232525676488876, "rewards/reward_financial_reasoning/mean": -0.22500000335276127, "rewards/reward_financial_reasoning/std": 0.19232525676488876, "step": 50, "step_time": 15.39180759699775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 19.0625, "completions/mean_terminated_length": 19.0625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.02315227070347284, "frac_reward_zero_std": 0.75, "grad_norm": 3.9910659790039062, "kl": 2.668681114912033, "learning_rate": 1.1333333333333334e-06, "loss": 0.11818550527095795, "num_tokens": 442006.0, "reward": -0.15000000223517418, "reward_std": 0.3017780929803848, "rewards/reward_financial_reasoning/mean": -0.15000000223517418, "rewards/reward_financial_reasoning/std": 0.301778107881546, "step": 52, "step_time": 18.692679949499507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 13.5, "completions/max_terminated_length": 13.5, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.02404274265360641, "frac_reward_zero_std": 0.5, "grad_norm": 5.372073173522949, "kl": 3.6646086424589157, "learning_rate": 1.1777777777777778e-06, "loss": 0.02050051838159561, "num_tokens": 459962.0, "reward": -0.09999999683350325, "reward_std": 0.3260497897863388, "rewards/reward_financial_reasoning/mean": -0.09999999683350325, "rewards/reward_financial_reasoning/std": 0.3260497897863388, "step": 54, "step_time": 13.365985435504626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 23.5, "completions/max_terminated_length": 23.5, "completions/mean_length": 18.3125, "completions/mean_terminated_length": 18.3125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.024933214603739984, "frac_reward_zero_std": 0.75, "grad_norm": 0.28867170214653015, "kl": 2.8480603992938995, "learning_rate": 1.2222222222222223e-06, "loss": 0.015568019822239876, "num_tokens": 479183.0, "reward": -0.07500000484287739, "reward_std": 0.24577751010656357, "rewards/reward_financial_reasoning/mean": -0.07500000484287739, "rewards/reward_financial_reasoning/std": 0.24577751755714417, "step": 56, "step_time": 16.855563032000646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 26.5, "completions/max_terminated_length": 26.5, "completions/mean_length": 15.0625, "completions/mean_terminated_length": 15.0625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.025823686553873553, "frac_reward_zero_std": 0.5, "grad_norm": 3.5727155208587646, "kl": 4.796411827206612, "learning_rate": 1.2666666666666669e-06, "loss": 0.06868542730808258, "num_tokens": 498696.0, "reward": -0.20000000298023224, "reward_std": 0.3437739461660385, "rewards/reward_financial_reasoning/mean": -0.20000000298023224, "rewards/reward_financial_reasoning/std": 0.3437739461660385, "step": 58, "step_time": 17.986308140998517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 8.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.026714158504007122, "frac_reward_zero_std": 1.0, "grad_norm": 0.5326482653617859, "kl": 4.87631031870842, "learning_rate": 1.3111111111111112e-06, "loss": 0.15867890417575836, "num_tokens": 520465.0, "reward": -0.2749999985098839, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": -0.2749999985098839, "rewards/reward_financial_reasoning/std": 0.24053511768579483, "step": 60, "step_time": 15.652293133502098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.125, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 13.125, "completions/mean_terminated_length": 13.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.027604630454140695, "frac_reward_zero_std": 0.5, "grad_norm": 3.4932446479797363, "kl": 3.3685058057308197, "learning_rate": 1.3555555555555558e-06, "loss": -0.01082983985543251, "num_tokens": 541939.0, "reward": -0.04999999701976776, "reward_std": 0.44950494170188904, "rewards/reward_financial_reasoning/mean": -0.04999999701976776, "rewards/reward_financial_reasoning/std": 0.44950494170188904, "step": 62, "step_time": 18.303554125999653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.125, "completions/clipped_ratio": 0.0, "completions/max_length": 36.5, "completions/max_terminated_length": 36.5, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.028495102404274265, "frac_reward_zero_std": 0.5, "grad_norm": 292.3175048828125, "kl": 30.459757924079895, "learning_rate": 1.4000000000000001e-06, "loss": 0.4946590065956116, "num_tokens": 560885.0, "reward": -0.06250000279396772, "reward_std": 0.24493902921676636, "rewards/reward_financial_reasoning/mean": -0.06250000279396772, "rewards/reward_financial_reasoning/std": 0.24493904411792755, "step": 64, "step_time": 19.929579762499998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 17.6875, "completions/mean_terminated_length": 17.6875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.029385574354407838, "frac_reward_zero_std": 0.75, "grad_norm": 0.331460565328598, "kl": 6.336628198623657, "learning_rate": 1.4444444444444445e-06, "loss": 0.2450381964445114, "num_tokens": 578064.0, "reward": -0.08750000037252903, "reward_std": 0.2354431226849556, "rewards/reward_financial_reasoning/mean": -0.08750000037252903, "rewards/reward_financial_reasoning/std": 0.2354431226849556, "step": 66, "step_time": 16.05476952400386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 14.3125, "completions/mean_terminated_length": 14.3125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.030276046304541407, "frac_reward_zero_std": 0.75, "grad_norm": 2.422355890274048, "kl": 1.9846842586994171, "learning_rate": 1.4888888888888888e-06, "loss": 0.10696038603782654, "num_tokens": 590245.0, "reward": 0.2500000074505806, "reward_std": 0.257793553173542, "rewards/reward_financial_reasoning/mean": 0.2500000074505806, "rewards/reward_financial_reasoning/std": 0.2577935680747032, "step": 68, "step_time": 12.55215245600084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 18.0625, "completions/mean_terminated_length": 18.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.031166518254674976, "frac_reward_zero_std": 0.75, "grad_norm": 0.34565091133117676, "kl": 3.1595654487609863, "learning_rate": 1.5333333333333334e-06, "loss": -0.003766145557165146, "num_tokens": 605742.0, "reward": -0.0625, "reward_std": 0.3245647996664047, "rewards/reward_financial_reasoning/mean": -0.0625, "rewards/reward_financial_reasoning/std": 0.3245648145675659, "step": 70, "step_time": 25.756368717500663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 9.1875, "completions/mean_terminated_length": 9.1875, "completions/min_length": 3.5, "completions/min_terminated_length": 3.5, "epoch": 0.032056990204808546, "frac_reward_zero_std": 0.75, "grad_norm": 8.074850082397461, "kl": 5.072204828262329, "learning_rate": 1.5777777777777778e-06, "loss": 0.19083915650844574, "num_tokens": 630049.0, "reward": -0.38750000298023224, "reward_std": 0.155264750123024, "rewards/reward_financial_reasoning/mean": -0.38750000298023224, "rewards/reward_financial_reasoning/std": 0.155264750123024, "step": 72, "step_time": 19.102319961501053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.125, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 19.125, "completions/mean_terminated_length": 19.125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.03294746215494212, "frac_reward_zero_std": 0.75, "grad_norm": 1.7271537780761719, "kl": 6.776509702205658, "learning_rate": 1.6222222222222223e-06, "loss": 0.2730098068714142, "num_tokens": 644963.0, "reward": 0.125, "reward_std": 0.4242233335971832, "rewards/reward_financial_reasoning/mean": 0.125, "rewards/reward_financial_reasoning/std": 0.4242233335971832, "step": 74, "step_time": 17.979279184004554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.1875, "completions/clipped_ratio": 0.0625, "completions/max_length": 156.0, "completions/max_terminated_length": 37.5, "completions/mean_length": 37.1875, "completions/mean_terminated_length": 22.23214340209961, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.03383793410507569, "frac_reward_zero_std": 1.0, "grad_norm": 0.6272057294845581, "kl": 2.333713859319687, "learning_rate": 1.6666666666666667e-06, "loss": 0.0896148756146431, "num_tokens": 658926.0, "reward": -0.12500000186264515, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": -0.12500000186264515, "rewards/reward_financial_reasoning/std": 0.24053511768579483, "step": 76, "step_time": 48.02580772499823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.375, "completions/clipped_ratio": 0.0, "completions/max_length": 28.5, "completions/max_terminated_length": 28.5, "completions/mean_length": 11.375, "completions/mean_terminated_length": 11.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.034728406055209264, "frac_reward_zero_std": 0.5, "grad_norm": 3.010223865509033, "kl": 5.109567791223526, "learning_rate": 1.7111111111111112e-06, "loss": -0.013501618057489395, "num_tokens": 676228.0, "reward": 0.10000000894069672, "reward_std": 0.45257411897182465, "rewards/reward_financial_reasoning/mean": 0.10000000894069672, "rewards/reward_financial_reasoning/std": 0.45257411897182465, "step": 78, "step_time": 16.771125109999048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 14.0625, "completions/mean_terminated_length": 14.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.03561887800534283, "frac_reward_zero_std": 0.5, "grad_norm": 5.468640327453613, "kl": 3.081940993666649, "learning_rate": 1.7555555555555556e-06, "loss": -0.07647820562124252, "num_tokens": 698397.0, "reward": -0.049999999813735485, "reward_std": 0.2613307610154152, "rewards/reward_financial_reasoning/mean": -0.049999999813735485, "rewards/reward_financial_reasoning/std": 0.2613307684659958, "step": 80, "step_time": 19.49620248700012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.125, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 15.125, "completions/mean_terminated_length": 15.125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0365093499554764, "frac_reward_zero_std": 0.5, "grad_norm": 3.437584161758423, "kl": 3.713206648826599, "learning_rate": 1.8000000000000001e-06, "loss": 0.10175147652626038, "num_tokens": 719007.0, "reward": -0.049999999813735485, "reward_std": 0.2613307684659958, "rewards/reward_financial_reasoning/mean": -0.049999999813735485, "rewards/reward_financial_reasoning/std": 0.2613307684659958, "step": 82, "step_time": 17.08468607800205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.625, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 21.625, "completions/mean_terminated_length": 21.625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.037399821905609976, "frac_reward_zero_std": 0.75, "grad_norm": 0.4997353255748749, "kl": 3.0508928298950195, "learning_rate": 1.8444444444444445e-06, "loss": 0.11351507902145386, "num_tokens": 736329.0, "reward": 0.06250000093132257, "reward_std": 0.1060660183429718, "rewards/reward_financial_reasoning/mean": 0.06250000093132257, "rewards/reward_financial_reasoning/std": 0.1060660183429718, "step": 84, "step_time": 16.044017669501045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.125, "completions/clipped_ratio": 0.0625, "completions/max_length": 145.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 26.125, "completions/mean_terminated_length": 10.562500238418579, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.03829029385574354, "frac_reward_zero_std": 0.5, "grad_norm": 2.8706929683685303, "kl": 2.500796929001808, "learning_rate": 1.888888888888889e-06, "loss": -0.08194264769554138, "num_tokens": 749475.0, "reward": 0.1250000074505806, "reward_std": 0.37987764179706573, "rewards/reward_financial_reasoning/mean": 0.1250000074505806, "rewards/reward_financial_reasoning/std": 0.3798776715993881, "step": 86, "step_time": 43.53679231749993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.6875, "completions/clipped_ratio": 0.0625, "completions/max_length": 141.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 34.6875, "completions/mean_terminated_length": 19.85714340209961, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.039180765805877114, "frac_reward_zero_std": 0.75, "grad_norm": 10.299760818481445, "kl": 3.631163567304611, "learning_rate": 1.9333333333333336e-06, "loss": 0.09746833145618439, "num_tokens": 773742.0, "reward": -0.10000000521540642, "reward_std": 0.09258200973272324, "rewards/reward_financial_reasoning/mean": -0.10000000521540642, "rewards/reward_financial_reasoning/std": 0.09258200973272324, "step": 88, "step_time": 51.070965140001135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.875, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 76.875, "completions/mean_terminated_length": 18.742857933044434, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.04007123775601069, "frac_reward_zero_std": 0.75, "grad_norm": 1.2004384994506836, "kl": 4.518661990761757, "learning_rate": 1.977777777777778e-06, "loss": 0.21323581039905548, "num_tokens": 793972.0, "reward": -0.03749999403953552, "reward_std": 0.285500705242157, "rewards/reward_financial_reasoning/mean": -0.03749999403953552, "rewards/reward_financial_reasoning/std": 0.28550073504447937, "step": 90, "step_time": 77.64364999449936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 13.5, "completions/max_terminated_length": 13.5, "completions/mean_length": 9.0625, "completions/mean_terminated_length": 9.0625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.04096170970614425, "frac_reward_zero_std": 1.0, "grad_norm": 15.11466121673584, "kl": 4.976789236068726, "learning_rate": 2.0222222222222223e-06, "loss": 0.16917964816093445, "num_tokens": 806405.0, "reward": 0.6749999970197678, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": 0.6749999970197678, "rewards/reward_financial_reasoning/std": 0.24053513258695602, "step": 92, "step_time": 11.1175422624965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 13.1875, "completions/mean_terminated_length": 13.1875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.041852181656277826, "frac_reward_zero_std": 1.0, "grad_norm": 11.747920989990234, "kl": 5.53629332780838, "learning_rate": 2.0666666666666666e-06, "loss": 0.21535387635231018, "num_tokens": 819672.0, "reward": 0.2500000037252903, "reward_std": 0.32071349024772644, "rewards/reward_financial_reasoning/mean": 0.2500000037252903, "rewards/reward_financial_reasoning/std": 0.32071349024772644, "step": 94, "step_time": 15.913111352499982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.25, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 17.25, "completions/mean_terminated_length": 17.25, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.0427426536064114, "frac_reward_zero_std": 1.0, "grad_norm": 0.10250017791986465, "kl": 2.3222732543945312, "learning_rate": 2.1111111111111114e-06, "loss": 0.09148351848125458, "num_tokens": 837228.0, "reward": 0.45000001788139343, "reward_std": 0.37416574358940125, "rewards/reward_financial_reasoning/mean": 0.45000001788139343, "rewards/reward_financial_reasoning/std": 0.37416577339172363, "step": 96, "step_time": 14.932274136997876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 60.8125, "completions/clipped_ratio": 0.1875, "completions/max_length": 134.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 60.8125, "completions/mean_terminated_length": 17.6875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.04363312555654497, "frac_reward_zero_std": 0.75, "grad_norm": 0.2879371643066406, "kl": 2.264972969889641, "learning_rate": 2.1555555555555558e-06, "loss": 0.09448125958442688, "num_tokens": 852985.0, "reward": -0.0625, "reward_std": 0.3245647996664047, "rewards/reward_financial_reasoning/mean": -0.0625, "rewards/reward_financial_reasoning/std": 0.3245648145675659, "step": 98, "step_time": 42.98987148800006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 21.0625, "completions/mean_terminated_length": 21.0625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.04452359750667854, "frac_reward_zero_std": 1.0, "grad_norm": 0.05980491638183594, "kl": 2.593031495809555, "learning_rate": 2.2e-06, "loss": 0.10243881493806839, "num_tokens": 866602.0, "reward": -0.15000000223517418, "reward_std": 0.05345224589109421, "rewards/reward_financial_reasoning/mean": -0.15000000223517418, "rewards/reward_financial_reasoning/std": 0.05345224589109421, "step": 100, "step_time": 13.76024783450157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.625, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 14.625, "completions/mean_terminated_length": 14.625, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.04541406945681211, "frac_reward_zero_std": 0.75, "grad_norm": 4.663613319396973, "kl": 3.8922190964221954, "learning_rate": 2.2444444444444445e-06, "loss": 0.06969564408063889, "num_tokens": 881948.0, "reward": 0.3124999888241291, "reward_std": 0.30207616090774536, "rewards/reward_financial_reasoning/mean": 0.3124999888241291, "rewards/reward_financial_reasoning/std": 0.302076131105423, "step": 102, "step_time": 13.194114739499128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 38.5, "completions/max_terminated_length": 38.5, "completions/mean_length": 22.1875, "completions/mean_terminated_length": 22.1875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.04630454140694568, "frac_reward_zero_std": 1.0, "grad_norm": 0.29489243030548096, "kl": 2.4103888869285583, "learning_rate": 2.2888888888888892e-06, "loss": 0.09568312764167786, "num_tokens": 901039.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 104, "step_time": 20.464529470000343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.1875, "completions/clipped_ratio": 0.0625, "completions/max_length": 176.5, "completions/max_terminated_length": 60.5, "completions/mean_length": 34.1875, "completions/mean_terminated_length": 19.3125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.04719501335707925, "frac_reward_zero_std": 0.75, "grad_norm": 1.2691603899002075, "kl": 2.7120189517736435, "learning_rate": 2.3333333333333336e-06, "loss": 0.07602652907371521, "num_tokens": 922626.0, "reward": -0.05000000074505806, "reward_std": 0.21905138343572617, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.21905138343572617, "step": 106, "step_time": 58.071344395997585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.25, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 18.25, "completions/mean_terminated_length": 18.25, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.04808548530721282, "frac_reward_zero_std": 0.5, "grad_norm": 6.271705627441406, "kl": 3.232674241065979, "learning_rate": 2.377777777777778e-06, "loss": 0.07806958258152008, "num_tokens": 934790.0, "reward": 0.050000011920928955, "reward_std": 0.42308470606803894, "rewards/reward_financial_reasoning/mean": 0.050000011920928955, "rewards/reward_financial_reasoning/std": 0.4230847507715225, "step": 108, "step_time": 16.047973144000935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 24.3125, "completions/mean_terminated_length": 24.3125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.048975957257346395, "frac_reward_zero_std": 1.0, "grad_norm": 0.09767260402441025, "kl": 2.3331067264080048, "learning_rate": 2.4222222222222223e-06, "loss": 0.09566066414117813, "num_tokens": 950203.0, "reward": -3.725290298461914e-09, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 110, "step_time": 24.890910685500785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.6875, "completions/clipped_ratio": 0.0625, "completions/max_length": 135.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 35.6875, "completions/mean_terminated_length": 21.598215103149414, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.04986642920747997, "frac_reward_zero_std": 0.75, "grad_norm": 0.7795641422271729, "kl": 7.069983154535294, "learning_rate": 2.466666666666667e-06, "loss": 0.3206178545951843, "num_tokens": 972878.0, "reward": -0.01249999925494194, "reward_std": 0.12464234232902527, "rewards/reward_financial_reasoning/mean": -0.01249999925494194, "rewards/reward_financial_reasoning/std": 0.12464234232902527, "step": 112, "step_time": 47.53451731649693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.875, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 24.875, "completions/mean_terminated_length": 24.875, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.05075690115761353, "frac_reward_zero_std": 0.5, "grad_norm": 1.755477786064148, "kl": 2.44833767414093, "learning_rate": 2.5111111111111114e-06, "loss": -0.027675483375787735, "num_tokens": 991140.0, "reward": 0.08750000596046448, "reward_std": 0.3419739603996277, "rewards/reward_financial_reasoning/mean": 0.08750000596046448, "rewards/reward_financial_reasoning/std": 0.34197400510311127, "step": 114, "step_time": 18.244106624995766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 14.9375, "completions/mean_terminated_length": 14.9375, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.051647373107747106, "frac_reward_zero_std": 0.75, "grad_norm": 1.7609986066818237, "kl": 3.860931694507599, "learning_rate": 2.5555555555555557e-06, "loss": 0.11580531299114227, "num_tokens": 1007059.0, "reward": 0.04999999701976776, "reward_std": 0.3033005967736244, "rewards/reward_financial_reasoning/mean": 0.04999999701976776, "rewards/reward_financial_reasoning/std": 0.3033006191253662, "step": 116, "step_time": 14.473379719500372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.375, "completions/clipped_ratio": 0.0, "completions/max_length": 35.5, "completions/max_terminated_length": 35.5, "completions/mean_length": 21.375, "completions/mean_terminated_length": 21.375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.05253784505788068, "frac_reward_zero_std": 1.0, "grad_norm": 0.042947202920913696, "kl": 5.62386429309845, "learning_rate": 2.6e-06, "loss": 0.17796431481838226, "num_tokens": 1027377.0, "reward": -0.07500000484287739, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": -0.07500000484287739, "rewards/reward_financial_reasoning/std": 0.18708287924528122, "step": 118, "step_time": 20.568094111997198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.625, "completions/clipped_ratio": 0.0, "completions/max_length": 69.5, "completions/max_terminated_length": 69.5, "completions/mean_length": 31.625, "completions/mean_terminated_length": 31.625, "completions/min_length": 2.5, "completions/min_terminated_length": 2.5, "epoch": 0.053428317008014245, "frac_reward_zero_std": 0.75, "grad_norm": 0.8792534470558167, "kl": 4.0221880078315735, "learning_rate": 2.6444444444444444e-06, "loss": 0.06174422800540924, "num_tokens": 1047523.0, "reward": -0.1374999973922968, "reward_std": 0.3001621291041374, "rewards/reward_financial_reasoning/mean": -0.1374999973922968, "rewards/reward_financial_reasoning/std": 0.3001621440052986, "step": 120, "step_time": 29.076913421999052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 21.6875, "completions/mean_terminated_length": 21.6875, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.05431878895814782, "frac_reward_zero_std": 0.75, "grad_norm": 0.11915815621614456, "kl": 2.785850167274475, "learning_rate": 2.6888888888888892e-06, "loss": 0.12607991695404053, "num_tokens": 1065086.0, "reward": -0.025000005960464478, "reward_std": 0.1776151806116104, "rewards/reward_financial_reasoning/mean": -0.025000005960464478, "rewards/reward_financial_reasoning/std": 0.177615188062191, "step": 122, "step_time": 15.821294531000603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.6875, "completions/clipped_ratio": 0.0625, "completions/max_length": 134.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 24.6875, "completions/mean_terminated_length": 9.151785850524902, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.05520926090828139, "frac_reward_zero_std": 1.0, "grad_norm": 3.4671032428741455, "kl": 3.9387396574020386, "learning_rate": 2.7333333333333336e-06, "loss": 0.13445711135864258, "num_tokens": 1082913.0, "reward": 0.050000011920928955, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.050000011920928955, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 124, "step_time": 44.41438667900002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 17.5625, "completions/mean_terminated_length": 17.5625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.05609973285841496, "frac_reward_zero_std": 1.0, "grad_norm": 0.21380269527435303, "kl": 2.8844070732593536, "learning_rate": 2.7777777777777783e-06, "loss": 0.10589596629142761, "num_tokens": 1103354.0, "reward": 0.1250000037252903, "reward_std": 0.34743958711624146, "rewards/reward_financial_reasoning/mean": 0.1250000037252903, "rewards/reward_financial_reasoning/std": 0.34743958711624146, "step": 126, "step_time": 16.954522954500135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 137.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 9.357142925262451, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.05699020480854853, "frac_reward_zero_std": 1.0, "grad_norm": 5.337584972381592, "kl": 5.580496460199356, "learning_rate": 2.8222222222222223e-06, "loss": 0.20137156546115875, "num_tokens": 1120778.0, "reward": -0.04999999701976776, "reward_std": 0.37416573613882065, "rewards/reward_financial_reasoning/mean": -0.04999999701976776, "rewards/reward_financial_reasoning/std": 0.37416573613882065, "step": 128, "step_time": 45.34434924549896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0578806767586821, "frac_reward_zero_std": 1.0, "grad_norm": 3.981386661529541, "kl": 3.401599198579788, "learning_rate": 2.866666666666667e-06, "loss": 0.12407220900058746, "num_tokens": 1141394.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 130, "step_time": 16.4087845009999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 10.9375, "completions/mean_terminated_length": 10.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.058771148708815675, "frac_reward_zero_std": 1.0, "grad_norm": 0.20108631253242493, "kl": 3.4849415719509125, "learning_rate": 2.9111111111111114e-06, "loss": 0.12001919746398926, "num_tokens": 1154313.0, "reward": 0.3499999865889549, "reward_std": 0.21380899101495743, "rewards/reward_financial_reasoning/mean": 0.3499999865889549, "rewards/reward_financial_reasoning/std": 0.21380899101495743, "step": 132, "step_time": 12.112006235996887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 12.0625, "completions/mean_terminated_length": 12.0625, "completions/min_length": 3.5, "completions/min_terminated_length": 3.5, "epoch": 0.05966162065894924, "frac_reward_zero_std": 0.75, "grad_norm": 9.684659957885742, "kl": 3.3992087990045547, "learning_rate": 2.955555555555556e-06, "loss": 0.1022736206650734, "num_tokens": 1171394.0, "reward": 0.1374999936670065, "reward_std": 0.4541053995490074, "rewards/reward_financial_reasoning/mean": 0.1374999936670065, "rewards/reward_financial_reasoning/std": 0.4541054293513298, "step": 134, "step_time": 14.642255357997783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.375, "completions/clipped_ratio": 0.0, "completions/max_length": 13.5, "completions/max_terminated_length": 13.5, "completions/mean_length": 11.375, "completions/mean_terminated_length": 11.375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.060552092609082814, "frac_reward_zero_std": 0.75, "grad_norm": 0.3956330716609955, "kl": 4.586791276931763, "learning_rate": 3e-06, "loss": 0.12092146277427673, "num_tokens": 1186376.0, "reward": 0.21250000968575478, "reward_std": 0.20310094952583313, "rewards/reward_financial_reasoning/mean": 0.21250000968575478, "rewards/reward_financial_reasoning/std": 0.20310097932815552, "step": 136, "step_time": 11.559512399000596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.75, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.75, "completions/mean_terminated_length": 10.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.06144256455921639, "frac_reward_zero_std": 0.75, "grad_norm": 2.993830680847168, "kl": 3.829523801803589, "learning_rate": 3.044444444444445e-06, "loss": 0.13624754548072815, "num_tokens": 1201172.0, "reward": -0.0624999962747097, "reward_std": 0.2199837565422058, "rewards/reward_financial_reasoning/mean": -0.0624999962747097, "rewards/reward_financial_reasoning/std": 0.219983771443367, "step": 138, "step_time": 12.256231981002202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.375, "completions/clipped_ratio": 0.0625, "completions/max_length": 149.0, "completions/max_terminated_length": 34.5, "completions/mean_length": 35.375, "completions/mean_terminated_length": 20.723215103149414, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.06233303650934995, "frac_reward_zero_std": 0.75, "grad_norm": 7.678333282470703, "kl": 3.021816372871399, "learning_rate": 3.088888888888889e-06, "loss": 0.032898858189582825, "num_tokens": 1223026.0, "reward": -0.08750000037252903, "reward_std": 0.20482071489095688, "rewards/reward_financial_reasoning/mean": -0.08750000037252903, "rewards/reward_financial_reasoning/std": 0.20482071489095688, "step": 140, "step_time": 50.93402997600242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.06322350845948353, "frac_reward_zero_std": 1.0, "grad_norm": 0.1813589483499527, "kl": 4.437779903411865, "learning_rate": 3.133333333333334e-06, "loss": 0.1627044826745987, "num_tokens": 1245666.0, "reward": -0.30000000447034836, "reward_std": 0.21380899101495743, "rewards/reward_financial_reasoning/mean": -0.30000000447034836, "rewards/reward_financial_reasoning/std": 0.21380899101495743, "step": 142, "step_time": 18.050760761499987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.75, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.75, "completions/mean_terminated_length": 14.75, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.06411398040961709, "frac_reward_zero_std": 1.0, "grad_norm": 0.23129655420780182, "kl": 2.1533931493759155, "learning_rate": 3.177777777777778e-06, "loss": 0.08247792720794678, "num_tokens": 1258398.0, "reward": 0.45000001788139343, "reward_std": 0.37416574358940125, "rewards/reward_financial_reasoning/mean": 0.45000001788139343, "rewards/reward_financial_reasoning/std": 0.37416577339172363, "step": 144, "step_time": 11.140301175000786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.125, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 15.125, "completions/mean_terminated_length": 15.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.06500445235975066, "frac_reward_zero_std": 0.5, "grad_norm": 4.715083599090576, "kl": 2.318701334297657, "learning_rate": 3.2222222222222227e-06, "loss": -0.08997282385826111, "num_tokens": 1279640.0, "reward": 0.07500000298023224, "reward_std": 0.3218744471669197, "rewards/reward_financial_reasoning/mean": 0.07500000298023224, "rewards/reward_financial_reasoning/std": 0.3218744695186615, "step": 146, "step_time": 17.162366078498962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.06589492430988424, "frac_reward_zero_std": 0.75, "grad_norm": 0.4836774468421936, "kl": 3.3755117654800415, "learning_rate": 3.266666666666667e-06, "loss": 0.18510375916957855, "num_tokens": 1296112.0, "reward": 0.07500000111758709, "reward_std": 0.276574470102787, "rewards/reward_financial_reasoning/mean": 0.07500000111758709, "rewards/reward_financial_reasoning/std": 0.2765744850039482, "step": 148, "step_time": 13.026451161000296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.5625, "completions/mean_terminated_length": 18.5625, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.06678539626001781, "frac_reward_zero_std": 1.0, "grad_norm": 0.11278322339057922, "kl": 2.8200909793376923, "learning_rate": 3.3111111111111118e-06, "loss": 0.11123226583003998, "num_tokens": 1311249.0, "reward": -0.15000000223517418, "reward_std": 0.05345224589109421, "rewards/reward_financial_reasoning/mean": -0.15000000223517418, "rewards/reward_financial_reasoning/std": 0.05345224589109421, "step": 150, "step_time": 14.589614674499899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 132.5, "completions/max_terminated_length": 10.5, "completions/mean_length": 24.0, "completions/mean_terminated_length": 8.633928775787354, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.06767586821015138, "frac_reward_zero_std": 0.75, "grad_norm": 11.057661056518555, "kl": 25.710087820887566, "learning_rate": 3.3555555555555557e-06, "loss": 1.1084628105163574, "num_tokens": 1330345.0, "reward": -0.012499988079071045, "reward_std": 0.28327932208776474, "rewards/reward_financial_reasoning/mean": -0.012499988079071045, "rewards/reward_financial_reasoning/std": 0.2832793518900871, "step": 152, "step_time": 44.78167107950139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.375, "completions/clipped_ratio": 0.0, "completions/max_length": 39.5, "completions/max_terminated_length": 39.5, "completions/mean_length": 28.375, "completions/mean_terminated_length": 28.375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.06856634016028496, "frac_reward_zero_std": 1.0, "grad_norm": 0.4117250144481659, "kl": 2.3949645459651947, "learning_rate": 3.4000000000000005e-06, "loss": 0.09245544672012329, "num_tokens": 1352247.0, "reward": -3.725290298461914e-09, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 154, "step_time": 22.419895262502905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.625, "completions/clipped_ratio": 0.0, "completions/max_length": 34.5, "completions/max_terminated_length": 34.5, "completions/mean_length": 19.625, "completions/mean_terminated_length": 19.625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.06945681211041853, "frac_reward_zero_std": 1.0, "grad_norm": 2.388998031616211, "kl": 4.125469475984573, "learning_rate": 3.444444444444445e-06, "loss": 0.14830118417739868, "num_tokens": 1368657.0, "reward": 0.3999999836087227, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": 0.3999999836087227, "rewards/reward_financial_reasoning/std": 0.16035675257444382, "step": 156, "step_time": 17.970027042503716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.625, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 21.625, "completions/mean_terminated_length": 21.625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.07034728406055209, "frac_reward_zero_std": 0.75, "grad_norm": 2.199248790740967, "kl": 2.5577471256256104, "learning_rate": 3.4888888888888896e-06, "loss": 0.031034421175718307, "num_tokens": 1392611.0, "reward": 0.06250000093132257, "reward_std": 0.1060660183429718, "rewards/reward_financial_reasoning/mean": 0.06250000093132257, "rewards/reward_financial_reasoning/std": 0.1060660183429718, "step": 158, "step_time": 19.871817894503693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.0625, "completions/mean_terminated_length": 21.0625, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.07123775601068566, "frac_reward_zero_std": 0.75, "grad_norm": 2.693187713623047, "kl": 2.236900717020035, "learning_rate": 3.5333333333333335e-06, "loss": 0.10101586580276489, "num_tokens": 1416028.0, "reward": 0.02500000037252903, "reward_std": 0.13887301087379456, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.13887301087379456, "step": 160, "step_time": 20.560153357999297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 30.5, "completions/max_terminated_length": 30.5, "completions/mean_length": 22.9375, "completions/mean_terminated_length": 22.9375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.07212822796081923, "frac_reward_zero_std": 0.75, "grad_norm": 0.08777616918087006, "kl": 2.303772583603859, "learning_rate": 3.577777777777778e-06, "loss": 0.09048432111740112, "num_tokens": 1435923.0, "reward": -0.11250000633299351, "reward_std": 0.1989518702030182, "rewards/reward_financial_reasoning/mean": -0.11250000633299351, "rewards/reward_financial_reasoning/std": 0.1989518627524376, "step": 162, "step_time": 18.901748009000585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.0730186999109528, "frac_reward_zero_std": 0.75, "grad_norm": 2.7375295162200928, "kl": 3.2199981957674026, "learning_rate": 3.6222222222222226e-06, "loss": 0.10302453488111496, "num_tokens": 1454867.0, "reward": 0.26250001043081284, "reward_std": 0.3512909263372421, "rewards/reward_financial_reasoning/mean": 0.26250001043081284, "rewards/reward_financial_reasoning/std": 0.3512909561395645, "step": 164, "step_time": 16.01710794649989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 29.5, "completions/max_terminated_length": 29.5, "completions/mean_length": 13.6875, "completions/mean_terminated_length": 13.6875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.07390917186108638, "frac_reward_zero_std": 1.0, "grad_norm": 0.24207016825675964, "kl": 4.016330122947693, "learning_rate": 3.6666666666666666e-06, "loss": 0.15451282262802124, "num_tokens": 1474742.0, "reward": -0.04999999701976776, "reward_std": 0.37416573613882065, "rewards/reward_financial_reasoning/mean": -0.04999999701976776, "rewards/reward_financial_reasoning/std": 0.37416573613882065, "step": 166, "step_time": 18.76729526700001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.125, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.125, "completions/mean_terminated_length": 11.125, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.07479964381121995, "frac_reward_zero_std": 1.0, "grad_norm": 0.42672666907310486, "kl": 4.4028559923172, "learning_rate": 3.7111111111111113e-06, "loss": 0.17786993086338043, "num_tokens": 1492792.0, "reward": 0.02499999850988388, "reward_std": 0.45434408634901047, "rewards/reward_financial_reasoning/mean": 0.02499999850988388, "rewards/reward_financial_reasoning/std": 0.45434409379959106, "step": 168, "step_time": 14.194298194499424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 14.6875, "completions/mean_terminated_length": 14.6875, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.07569011576135352, "frac_reward_zero_std": 1.0, "grad_norm": 1.46550452709198, "kl": 3.6853742003440857, "learning_rate": 3.7555555555555557e-06, "loss": 0.14712777733802795, "num_tokens": 1510931.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 170, "step_time": 15.002850790498997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.07658058771148708, "frac_reward_zero_std": 0.5, "grad_norm": 1.6111212968826294, "kl": 3.136908233165741, "learning_rate": 3.8000000000000005e-06, "loss": 0.1122005432844162, "num_tokens": 1533875.0, "reward": 0.037499998696148396, "reward_std": 0.1767766922712326, "rewards/reward_financial_reasoning/mean": 0.037499998696148396, "rewards/reward_financial_reasoning/std": 0.1767766997218132, "step": 172, "step_time": 18.050798523498088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.875, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.875, "completions/mean_terminated_length": 14.875, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.07747105966162066, "frac_reward_zero_std": 0.75, "grad_norm": 0.12939698994159698, "kl": 2.4790256321430206, "learning_rate": 3.844444444444445e-06, "loss": 0.020719770342111588, "num_tokens": 1548265.0, "reward": 0.21250000968575478, "reward_std": 0.20310094952583313, "rewards/reward_financial_reasoning/mean": 0.21250000968575478, "rewards/reward_financial_reasoning/std": 0.20310097932815552, "step": 174, "step_time": 12.47596741849884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 23.5, "completions/max_terminated_length": 23.5, "completions/mean_length": 13.9375, "completions/mean_terminated_length": 13.9375, "completions/min_length": 4.5, "completions/min_terminated_length": 4.5, "epoch": 0.07836153161175423, "frac_reward_zero_std": 0.75, "grad_norm": 5.401264190673828, "kl": 5.6927982568740845, "learning_rate": 3.88888888888889e-06, "loss": 0.09270425140857697, "num_tokens": 1564544.0, "reward": 0.125, "reward_std": 0.3060004860162735, "rewards/reward_financial_reasoning/mean": 0.125, "rewards/reward_financial_reasoning/std": 0.3060004934668541, "step": 176, "step_time": 15.803215666997858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 26.5, "completions/max_terminated_length": 26.5, "completions/mean_length": 18.6875, "completions/mean_terminated_length": 18.6875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0792520035618878, "frac_reward_zero_std": 1.0, "grad_norm": 0.2923208177089691, "kl": 3.5329647958278656, "learning_rate": 3.9333333333333335e-06, "loss": 0.14282937347888947, "num_tokens": 1585955.0, "reward": -0.07499999925494194, "reward_std": 0.026726126670837402, "rewards/reward_financial_reasoning/mean": -0.07499999925494194, "rewards/reward_financial_reasoning/std": 0.026726126670837402, "step": 178, "step_time": 18.7747930500027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 19.9375, "completions/mean_terminated_length": 19.9375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.08014247551202137, "frac_reward_zero_std": 1.0, "grad_norm": 0.2639215886592865, "kl": 6.156628400087357, "learning_rate": 3.977777777777778e-06, "loss": 0.19962944090366364, "num_tokens": 1610266.0, "reward": -0.32500000298023224, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": -0.32500000298023224, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 180, "step_time": 21.797623717498936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.625, "completions/clipped_ratio": 0.0625, "completions/max_length": 133.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 24.625, "completions/mean_terminated_length": 9.339285850524902, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.08103294746215495, "frac_reward_zero_std": 0.75, "grad_norm": 1.0663926601409912, "kl": 3.6094931960105896, "learning_rate": 4.022222222222222e-06, "loss": 0.20474201440811157, "num_tokens": 1627628.0, "reward": 0.08750000596046448, "reward_std": 0.6723021864891052, "rewards/reward_financial_reasoning/mean": 0.08750000596046448, "rewards/reward_financial_reasoning/std": 0.6723021864891052, "step": 182, "step_time": 44.85433911799737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.0819234194122885, "frac_reward_zero_std": 1.0, "grad_norm": 0.2525313198566437, "kl": 3.37558776140213, "learning_rate": 4.066666666666667e-06, "loss": 0.13418884575366974, "num_tokens": 1645004.0, "reward": 0.025000005960464478, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": 0.025000005960464478, "rewards/reward_financial_reasoning/std": 0.24053513258695602, "step": 184, "step_time": 14.621046053998725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.08281389136242208, "frac_reward_zero_std": 0.5, "grad_norm": 4.167638301849365, "kl": 3.0642621517181396, "learning_rate": 4.111111111111111e-06, "loss": 0.0730942115187645, "num_tokens": 1665756.0, "reward": -0.15000000409781933, "reward_std": 0.27403824776411057, "rewards/reward_financial_reasoning/mean": -0.15000000409781933, "rewards/reward_financial_reasoning/std": 0.27403824031352997, "step": 186, "step_time": 18.20900296650325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.0, "completions/mean_terminated_length": 18.0, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.08370436331255565, "frac_reward_zero_std": 1.0, "grad_norm": 0.09201586991548538, "kl": 2.442968338727951, "learning_rate": 4.155555555555556e-06, "loss": 0.09587804228067398, "num_tokens": 1684332.0, "reward": -0.07500000484287739, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": -0.07500000484287739, "rewards/reward_financial_reasoning/std": 0.18708287924528122, "step": 188, "step_time": 15.8890612949981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.0625, "completions/clipped_ratio": 0.0625, "completions/max_length": 148.0, "completions/max_terminated_length": 54.5, "completions/mean_length": 45.0625, "completions/mean_terminated_length": 30.642857551574707, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.08459483526268922, "frac_reward_zero_std": 0.75, "grad_norm": 0.757686972618103, "kl": 2.118753992021084, "learning_rate": 4.2000000000000004e-06, "loss": 0.025539696216583252, "num_tokens": 1703053.0, "reward": -0.012500002980232239, "reward_std": 0.2711125537753105, "rewards/reward_financial_reasoning/mean": -0.012500002980232239, "rewards/reward_financial_reasoning/std": 0.2711125761270523, "step": 190, "step_time": 48.18112432499947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.375, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 16.375, "completions/mean_terminated_length": 16.375, "completions/min_length": 3.5, "completions/min_terminated_length": 3.5, "epoch": 0.0854853072128228, "frac_reward_zero_std": 1.0, "grad_norm": 2.921569585800171, "kl": 3.9216238781809807, "learning_rate": 4.244444444444445e-06, "loss": 0.12688420712947845, "num_tokens": 1716339.0, "reward": 0.1750000026077032, "reward_std": 0.40089186280965805, "rewards/reward_financial_reasoning/mean": 0.1750000026077032, "rewards/reward_financial_reasoning/std": 0.40089186280965805, "step": 192, "step_time": 22.157812262499647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.375, "completions/clipped_ratio": 0.0, "completions/max_length": 47.5, "completions/max_terminated_length": 47.5, "completions/mean_length": 35.375, "completions/mean_terminated_length": 35.375, "completions/min_length": 24.5, "completions/min_terminated_length": 24.5, "epoch": 0.08637577916295637, "frac_reward_zero_std": 0.75, "grad_norm": 0.06356346607208252, "kl": 2.7308596670627594, "learning_rate": 4.288888888888889e-06, "loss": 0.12605991959571838, "num_tokens": 1736721.0, "reward": -0.17500000447034836, "reward_std": 0.1776151806116104, "rewards/reward_financial_reasoning/mean": -0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.177615188062191, "step": 194, "step_time": 23.11299426900041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.875, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 19.875, "completions/mean_terminated_length": 19.875, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.08726625111308994, "frac_reward_zero_std": 0.75, "grad_norm": 2.439833164215088, "kl": 2.893902435898781, "learning_rate": 4.333333333333334e-06, "loss": 0.14680534601211548, "num_tokens": 1752303.0, "reward": 0.04999999701976776, "reward_std": 0.2905927076935768, "rewards/reward_financial_reasoning/mean": 0.04999999701976776, "rewards/reward_financial_reasoning/std": 0.290592722594738, "step": 196, "step_time": 16.636957890499616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 48.5, "completions/max_terminated_length": 48.5, "completions/mean_length": 23.0625, "completions/mean_terminated_length": 23.0625, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.0881567230632235, "frac_reward_zero_std": 0.75, "grad_norm": 4.632042407989502, "kl": 2.5856750905513763, "learning_rate": 4.377777777777778e-06, "loss": 0.02410995587706566, "num_tokens": 1768720.0, "reward": 0.11250000447034836, "reward_std": 0.31000544875860214, "rewards/reward_financial_reasoning/mean": 0.11250000447034836, "rewards/reward_financial_reasoning/std": 0.3100054860115051, "step": 198, "step_time": 21.464168812499338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 13.9375, "completions/mean_terminated_length": 13.9375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.08904719501335707, "frac_reward_zero_std": 0.75, "grad_norm": 2.4800167083740234, "kl": 3.6862347424030304, "learning_rate": 4.422222222222223e-06, "loss": 0.1973346471786499, "num_tokens": 1789447.0, "reward": -0.08749999850988388, "reward_std": 0.485219344496727, "rewards/reward_financial_reasoning/mean": -0.08749999850988388, "rewards/reward_financial_reasoning/std": 0.485219344496727, "step": 200, "step_time": 22.986546036001528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 14.4375, "completions/mean_terminated_length": 14.4375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.08993766696349065, "frac_reward_zero_std": 1.0, "grad_norm": 0.7083855271339417, "kl": 2.8951866924762726, "learning_rate": 4.4666666666666665e-06, "loss": 0.12021103501319885, "num_tokens": 1807526.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 202, "step_time": 14.811557051994896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 14.9375, "completions/mean_terminated_length": 14.9375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.09082813891362422, "frac_reward_zero_std": 0.5, "grad_norm": 4.18487024307251, "kl": 4.287087470293045, "learning_rate": 4.511111111111111e-06, "loss": 0.10975737869739532, "num_tokens": 1825821.0, "reward": -0.012499988079071045, "reward_std": 0.3419739603996277, "rewards/reward_financial_reasoning/mean": -0.012499988079071045, "rewards/reward_financial_reasoning/std": 0.3419739902019501, "step": 204, "step_time": 16.611893776500438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.4375, "completions/clipped_ratio": 0.0625, "completions/max_length": 143.0, "completions/max_terminated_length": 29.5, "completions/mean_length": 36.4375, "completions/mean_terminated_length": 21.785715103149414, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.0917186108637578, "frac_reward_zero_std": 0.75, "grad_norm": 5.862529277801514, "kl": 2.1467432379722595, "learning_rate": 4.555555555555556e-06, "loss": 0.1539805829524994, "num_tokens": 1846972.0, "reward": 0.11250000447034836, "reward_std": 0.31000544875860214, "rewards/reward_financial_reasoning/mean": 0.11250000447034836, "rewards/reward_financial_reasoning/std": 0.3100054860115051, "step": 206, "step_time": 50.025916307500665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 32.5, "completions/max_terminated_length": 32.5, "completions/mean_length": 16.5625, "completions/mean_terminated_length": 16.5625, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.09260908281389137, "frac_reward_zero_std": 1.0, "grad_norm": 0.3114466369152069, "kl": 2.7216466069221497, "learning_rate": 4.600000000000001e-06, "loss": 0.10927754640579224, "num_tokens": 1859013.0, "reward": 0.07500000298023224, "reward_std": 0.34743961691856384, "rewards/reward_financial_reasoning/mean": 0.07500000298023224, "rewards/reward_financial_reasoning/std": 0.34743961691856384, "step": 208, "step_time": 14.936293552002098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 29.5, "completions/max_terminated_length": 29.5, "completions/mean_length": 19.8125, "completions/mean_terminated_length": 19.8125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.09349955476402494, "frac_reward_zero_std": 1.0, "grad_norm": 0.16725488007068634, "kl": 2.662395626306534, "learning_rate": 4.644444444444445e-06, "loss": 0.11168936640024185, "num_tokens": 1880250.0, "reward": -0.15000000223517418, "reward_std": 0.05345224589109421, "rewards/reward_financial_reasoning/mean": -0.15000000223517418, "rewards/reward_financial_reasoning/std": 0.05345224589109421, "step": 210, "step_time": 19.786764974996913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.0943900267141585, "frac_reward_zero_std": 1.0, "grad_norm": 0.07584002614021301, "kl": 2.111318424344063, "learning_rate": 4.6888888888888895e-06, "loss": 0.08701398968696594, "num_tokens": 1897586.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 212, "step_time": 15.371527886505646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.125, "completions/clipped_ratio": 0.0, "completions/max_length": 47.5, "completions/max_terminated_length": 47.5, "completions/mean_length": 22.125, "completions/mean_terminated_length": 22.125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.09528049866429207, "frac_reward_zero_std": 0.75, "grad_norm": 13.01284408569336, "kl": 4.123442500829697, "learning_rate": 4.7333333333333335e-06, "loss": 0.21884416043758392, "num_tokens": 1918428.0, "reward": -0.03750000149011612, "reward_std": 0.4317670986056328, "rewards/reward_financial_reasoning/mean": -0.03750000149011612, "rewards/reward_financial_reasoning/std": 0.4317671060562134, "step": 214, "step_time": 24.531285858996853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.375, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 12.375, "completions/mean_terminated_length": 12.375, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.09617097061442564, "frac_reward_zero_std": 1.0, "grad_norm": 0.17621411383152008, "kl": 2.0488149896264076, "learning_rate": 4.777777777777778e-06, "loss": 0.08796633780002594, "num_tokens": 1929210.0, "reward": 0.42500001192092896, "reward_std": 0.5077963620424271, "rewards/reward_financial_reasoning/mean": 0.42500001192092896, "rewards/reward_financial_reasoning/std": 0.5077963769435883, "step": 216, "step_time": 12.308896507001919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.125, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.125, "completions/mean_terminated_length": 11.125, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.09706144256455922, "frac_reward_zero_std": 1.0, "grad_norm": 0.2214796096086502, "kl": 3.1335892230272293, "learning_rate": 4.822222222222222e-06, "loss": 0.11833228915929794, "num_tokens": 1949284.0, "reward": 0.1250000074505806, "reward_std": 0.34743961691856384, "rewards/reward_financial_reasoning/mean": 0.1250000074505806, "rewards/reward_financial_reasoning/std": 0.34743963181972504, "step": 218, "step_time": 15.425380380998831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 18.4375, "completions/mean_terminated_length": 18.4375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.09795191451469279, "frac_reward_zero_std": 0.5, "grad_norm": 2.749892234802246, "kl": 3.3727396726608276, "learning_rate": 4.866666666666667e-06, "loss": 0.13246247172355652, "num_tokens": 1964643.0, "reward": 0.050000001676380634, "reward_std": 0.2702740728855133, "rewards/reward_financial_reasoning/mean": 0.050000001676380634, "rewards/reward_financial_reasoning/std": 0.2702740877866745, "step": 220, "step_time": 15.254870478507655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 87.5, "completions/max_terminated_length": 87.5, "completions/mean_length": 20.4375, "completions/mean_terminated_length": 20.4375, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.09884238646482636, "frac_reward_zero_std": 1.0, "grad_norm": 86.07991790771484, "kl": 5.883344158530235, "learning_rate": 4.911111111111112e-06, "loss": 0.24957503378391266, "num_tokens": 1976706.0, "reward": 0.25000000558793545, "reward_std": 0.32071348279714584, "rewards/reward_financial_reasoning/mean": 0.25000000558793545, "rewards/reward_financial_reasoning/std": 0.32071349769830704, "step": 222, "step_time": 29.745959900501475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.09973285841495994, "frac_reward_zero_std": 1.0, "grad_norm": 0.13994386792182922, "kl": 3.634816884994507, "learning_rate": 4.9555555555555565e-06, "loss": 0.14822566509246826, "num_tokens": 1990182.0, "reward": 0.050000011920928955, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.050000011920928955, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 224, "step_time": 14.035459309507132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.625, "completions/clipped_ratio": 0.0625, "completions/max_length": 132.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 27.625, "completions/mean_terminated_length": 12.758929252624512, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.1006233303650935, "frac_reward_zero_std": 0.75, "grad_norm": 8.492895126342773, "kl": 3.6512217223644257, "learning_rate": 5e-06, "loss": 0.10606303811073303, "num_tokens": 2012120.0, "reward": -0.14999999850988388, "reward_std": 0.4457136541604996, "rewards/reward_financial_reasoning/mean": -0.14999999850988388, "rewards/reward_financial_reasoning/std": 0.4457136541604996, "step": 226, "step_time": 48.76606522999282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.625, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 9.625, "completions/mean_terminated_length": 9.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.10151380231522707, "frac_reward_zero_std": 0.75, "grad_norm": 2.535083293914795, "kl": 2.9311847388744354, "learning_rate": 4.995051954477982e-06, "loss": 0.1752665936946869, "num_tokens": 2024234.0, "reward": 0.17500000447034836, "reward_std": 0.3918117731809616, "rewards/reward_financial_reasoning/mean": 0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.3918117731809616, "step": 228, "step_time": 11.424557798505703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.875, "completions/clipped_ratio": 0.0625, "completions/max_length": 144.0, "completions/max_terminated_length": 20.5, "completions/mean_length": 31.875, "completions/mean_terminated_length": 16.25000023841858, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.10240427426536064, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962670236825943, "kl": 2.9220656007528305, "learning_rate": 4.990103908955963e-06, "loss": 0.10569591820240021, "num_tokens": 2039264.0, "reward": 0.3999999836087227, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": 0.3999999836087227, "rewards/reward_financial_reasoning/std": 0.16035675257444382, "step": 230, "step_time": 45.636118210997665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.9375, "completions/mean_terminated_length": 13.9375, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.10329474621549421, "frac_reward_zero_std": 0.75, "grad_norm": 0.06443369388580322, "kl": 4.253673791885376, "learning_rate": 4.985155863433944e-06, "loss": 0.19769635796546936, "num_tokens": 2050935.0, "reward": 0.41249997168779373, "reward_std": 0.1586594134569168, "rewards/reward_financial_reasoning/mean": 0.41249997168779373, "rewards/reward_financial_reasoning/std": 0.1586594209074974, "step": 232, "step_time": 11.086136656002054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 18.5625, "completions/mean_terminated_length": 18.5625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.10418521816562779, "frac_reward_zero_std": 1.0, "grad_norm": 0.1366126388311386, "kl": 2.861827075481415, "learning_rate": 4.980207817911925e-06, "loss": 0.1160162016749382, "num_tokens": 2071048.0, "reward": -3.725290298461914e-09, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 234, "step_time": 17.117513173492625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 12.9375, "completions/mean_terminated_length": 12.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.10507569011576136, "frac_reward_zero_std": 1.0, "grad_norm": 5.253388404846191, "kl": 4.99092635512352, "learning_rate": 4.975259772389907e-06, "loss": 0.1833035945892334, "num_tokens": 2086695.0, "reward": -0.375, "reward_std": 0.13363061845302582, "rewards/reward_financial_reasoning/mean": -0.375, "rewards/reward_financial_reasoning/std": 0.13363061845302582, "step": 236, "step_time": 15.18828914400001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.10596616206589493, "frac_reward_zero_std": 1.0, "grad_norm": 0.08144163340330124, "kl": 2.5794699490070343, "learning_rate": 4.970311726867888e-06, "loss": 0.10143584758043289, "num_tokens": 2102087.0, "reward": -0.10000000894069672, "reward_std": 0.21380899846553802, "rewards/reward_financial_reasoning/mean": -0.10000000894069672, "rewards/reward_financial_reasoning/std": 0.21380901336669922, "step": 238, "step_time": 19.704446994994214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.10685663401602849, "frac_reward_zero_std": 0.75, "grad_norm": 3.362489700317383, "kl": 5.458074390888214, "learning_rate": 4.965363681345869e-06, "loss": 0.24738503992557526, "num_tokens": 2119563.0, "reward": 0.04999999701976776, "reward_std": 0.3974972069263458, "rewards/reward_financial_reasoning/mean": 0.04999999701976776, "rewards/reward_financial_reasoning/std": 0.3974972069263458, "step": 240, "step_time": 24.470748322499276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 61.3125, "completions/clipped_ratio": 0.1875, "completions/max_length": 138.0, "completions/max_terminated_length": 22.5, "completions/mean_length": 61.3125, "completions/mean_terminated_length": 17.100000381469727, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.10774710596616206, "frac_reward_zero_std": 0.75, "grad_norm": 6.715212821960449, "kl": 3.6618697345256805, "learning_rate": 4.96041563582385e-06, "loss": 0.25621670484542847, "num_tokens": 2137576.0, "reward": 0.26250001043081284, "reward_std": 0.3512909263372421, "rewards/reward_financial_reasoning/mean": 0.26250001043081284, "rewards/reward_financial_reasoning/std": 0.3512909561395645, "step": 242, "step_time": 48.974390029008646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.25, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 30.25, "completions/mean_terminated_length": 30.25, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.10863757791629564, "frac_reward_zero_std": 1.0, "grad_norm": 0.29011860489845276, "kl": 2.333177834749222, "learning_rate": 4.9554675903018315e-06, "loss": 0.09401866793632507, "num_tokens": 2157516.0, "reward": -3.725290298461914e-09, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 244, "step_time": 24.383899069496692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.5, "completions/max_terminated_length": 27.5, "completions/mean_length": 10.25, "completions/mean_terminated_length": 10.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.10952804986642921, "frac_reward_zero_std": 0.75, "grad_norm": 0.16274474561214447, "kl": 4.739854276180267, "learning_rate": 4.9505195447798124e-06, "loss": 0.2652374505996704, "num_tokens": 2175464.0, "reward": 0.22499998658895493, "reward_std": 0.4262783080339432, "rewards/reward_financial_reasoning/mean": 0.22499998658895493, "rewards/reward_financial_reasoning/std": 0.42627833783626556, "step": 246, "step_time": 17.72774281000602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.75, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 18.75, "completions/mean_terminated_length": 18.75, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.11041852181656278, "frac_reward_zero_std": 1.0, "grad_norm": 0.10518508404493332, "kl": 3.172191321849823, "learning_rate": 4.945571499257793e-06, "loss": 0.12077988684177399, "num_tokens": 2194132.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 248, "step_time": 17.46086399950218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5625, "completions/mean_terminated_length": 11.5625, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.11130899376669635, "frac_reward_zero_std": 0.75, "grad_norm": 0.09775284677743912, "kl": 4.232122749090195, "learning_rate": 4.940623453735775e-06, "loss": 0.16927936673164368, "num_tokens": 2210133.0, "reward": 0.36250001937150955, "reward_std": 0.14225983619689941, "rewards/reward_financial_reasoning/mean": 0.36250001937150955, "rewards/reward_financial_reasoning/std": 0.14225984364748, "step": 250, "step_time": 12.163132073997986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.625, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.625, "completions/mean_terminated_length": 11.625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.11219946571682991, "frac_reward_zero_std": 0.75, "grad_norm": 0.34086915850639343, "kl": 2.9683316498994827, "learning_rate": 4.935675408213756e-06, "loss": 0.1359396129846573, "num_tokens": 2226015.0, "reward": 0.23750000447034836, "reward_std": 0.3371334373950958, "rewards/reward_financial_reasoning/mean": 0.23750000447034836, "rewards/reward_financial_reasoning/std": 0.3371334373950958, "step": 252, "step_time": 13.12046697099504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 21.1875, "completions/mean_terminated_length": 21.1875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.11308993766696349, "frac_reward_zero_std": 0.75, "grad_norm": 0.16412538290023804, "kl": 3.1274770498275757, "learning_rate": 4.930727362691737e-06, "loss": 0.05577349662780762, "num_tokens": 2243394.0, "reward": 0.04999999701976776, "reward_std": 0.3033005967736244, "rewards/reward_financial_reasoning/mean": 0.04999999701976776, "rewards/reward_financial_reasoning/std": 0.3033006191253662, "step": 254, "step_time": 15.995422225503717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.375, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 12.375, "completions/mean_terminated_length": 12.375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.11398040961709706, "frac_reward_zero_std": 0.25, "grad_norm": 7.9987897872924805, "kl": 4.347164452075958, "learning_rate": 4.925779317169718e-06, "loss": 0.1008201465010643, "num_tokens": 2264448.0, "reward": 9.313225746154785e-10, "reward_std": 0.3648405969142914, "rewards/reward_financial_reasoning/mean": 9.313225746154785e-10, "rewards/reward_financial_reasoning/std": 0.3648405969142914, "step": 256, "step_time": 15.81367241350381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.11487088156723063, "frac_reward_zero_std": 1.0, "grad_norm": 0.1827375590801239, "kl": 3.112269103527069, "learning_rate": 4.9208312716477e-06, "loss": 0.11426741629838943, "num_tokens": 2281640.0, "reward": 0.45000001788139343, "reward_std": 0.37416574358940125, "rewards/reward_financial_reasoning/mean": 0.45000001788139343, "rewards/reward_financial_reasoning/std": 0.37416577339172363, "step": 258, "step_time": 15.266290853498504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5, "completions/clipped_ratio": 0.0, "completions/max_length": 28.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 21.5, "completions/mean_terminated_length": 21.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.1157613535173642, "frac_reward_zero_std": 1.0, "grad_norm": 0.06703684478998184, "kl": 2.67875400185585, "learning_rate": 4.915883226125681e-06, "loss": 0.10653968900442123, "num_tokens": 2302760.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 260, "step_time": 19.27089970898669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 34.0625, "completions/mean_terminated_length": 34.0625, "completions/min_length": 26.5, "completions/min_terminated_length": 26.5, "epoch": 0.11665182546749778, "frac_reward_zero_std": 1.0, "grad_norm": 0.5481160283088684, "kl": 2.3839191049337387, "learning_rate": 4.910935180603662e-06, "loss": 0.092856764793396, "num_tokens": 2320257.0, "reward": -0.10000000521540642, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": -0.10000000521540642, "rewards/reward_financial_reasoning/std": 0.0, "step": 262, "step_time": 20.353683264493156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.875, "completions/clipped_ratio": 0.0, "completions/max_length": 12.5, "completions/max_terminated_length": 12.5, "completions/mean_length": 10.875, "completions/mean_terminated_length": 10.875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.11754229741763135, "frac_reward_zero_std": 1.0, "grad_norm": 0.27290061116218567, "kl": 2.046361565589905, "learning_rate": 4.905987135081643e-06, "loss": 0.08147154003381729, "num_tokens": 2337095.0, "reward": 0.1250000037252903, "reward_std": 0.34743958711624146, "rewards/reward_financial_reasoning/mean": 0.1250000037252903, "rewards/reward_financial_reasoning/std": 0.34743958711624146, "step": 264, "step_time": 12.594754040492262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.625, "completions/clipped_ratio": 0.0625, "completions/max_length": 142.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 31.625, "completions/mean_terminated_length": 16.10714292526245, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.11843276936776491, "frac_reward_zero_std": 0.75, "grad_norm": 0.6876330375671387, "kl": 4.0588459968566895, "learning_rate": 4.9010390895596245e-06, "loss": 0.24528250098228455, "num_tokens": 2349969.0, "reward": 0.30000001937150955, "reward_std": 0.2920685186982155, "rewards/reward_financial_reasoning/mean": 0.30000001937150955, "rewards/reward_financial_reasoning/std": 0.2920685261487961, "step": 266, "step_time": 43.22905554099998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.11932324131789848, "frac_reward_zero_std": 1.0, "grad_norm": 0.10317867994308472, "kl": 3.0537564754486084, "learning_rate": 4.8960910440376054e-06, "loss": 0.12132774293422699, "num_tokens": 2372401.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 268, "step_time": 17.077831523005443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.9375, "completions/mean_terminated_length": 9.9375, "completions/min_length": 4.5, "completions/min_terminated_length": 4.5, "epoch": 0.12021371326803205, "frac_reward_zero_std": 1.0, "grad_norm": 0.3369079530239105, "kl": 5.676760792732239, "learning_rate": 4.891142998515586e-06, "loss": 0.20450036227703094, "num_tokens": 2382808.0, "reward": 0.3999999985098839, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": 0.3999999985098839, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 270, "step_time": 23.16856124699916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.25, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 16.25, "completions/mean_terminated_length": 16.25, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.12110418521816563, "frac_reward_zero_std": 0.75, "grad_norm": 0.8960656523704529, "kl": 3.670739457011223, "learning_rate": 4.886194952993568e-06, "loss": 0.1782083660364151, "num_tokens": 2395636.0, "reward": 0.10000001639127731, "reward_std": 0.38396354019641876, "rewards/reward_financial_reasoning/mean": 0.10000001639127731, "rewards/reward_financial_reasoning/std": 0.38396355509757996, "step": 272, "step_time": 13.109228974997677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.4375, "completions/mean_terminated_length": 15.4375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1219946571682992, "frac_reward_zero_std": 1.0, "grad_norm": 0.08199235051870346, "kl": 2.68540820479393, "learning_rate": 4.881246907471549e-06, "loss": 0.10612474381923676, "num_tokens": 2413979.0, "reward": -3.725290298461914e-09, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 274, "step_time": 15.015723587002867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 50.5, "completions/max_terminated_length": 50.5, "completions/mean_length": 32.0625, "completions/mean_terminated_length": 32.0625, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.12288512911843277, "frac_reward_zero_std": 0.75, "grad_norm": 0.7122228145599365, "kl": 1.6769566163420677, "learning_rate": 4.87629886194953e-06, "loss": -0.005341395735740662, "num_tokens": 2432404.0, "reward": 0.15000000223517418, "reward_std": 0.1963960975408554, "rewards/reward_financial_reasoning/mean": 0.15000000223517418, "rewards/reward_financial_reasoning/std": 0.1963961124420166, "step": 276, "step_time": 23.557659125493956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5, "completions/clipped_ratio": 0.0, "completions/max_length": 41.5, "completions/max_terminated_length": 41.5, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 4.5, "completions/min_terminated_length": 4.5, "epoch": 0.12377560106856635, "frac_reward_zero_std": 0.75, "grad_norm": 0.11030659079551697, "kl": 3.0595133006572723, "learning_rate": 4.871350816427511e-06, "loss": 0.08042588829994202, "num_tokens": 2451300.0, "reward": -0.08749999850988388, "reward_std": 0.485219344496727, "rewards/reward_financial_reasoning/mean": -0.08749999850988388, "rewards/reward_financial_reasoning/std": 0.485219344496727, "step": 278, "step_time": 21.63308912049979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.125, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 26.125, "completions/mean_terminated_length": 26.125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1246660730186999, "frac_reward_zero_std": 0.75, "grad_norm": 0.08378394693136215, "kl": 2.6196550726890564, "learning_rate": 4.866402770905493e-06, "loss": 0.21984213590621948, "num_tokens": 2475614.0, "reward": 0.38750001788139343, "reward_std": 0.39018382132053375, "rewards/reward_financial_reasoning/mean": 0.38750001788139343, "rewards/reward_financial_reasoning/std": 0.39018386602401733, "step": 280, "step_time": 41.759238430495316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.625, "completions/clipped_ratio": 0.0625, "completions/max_length": 144.0, "completions/max_terminated_length": 19.5, "completions/mean_length": 32.625, "completions/mean_terminated_length": 17.00000023841858, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.12555654496883348, "frac_reward_zero_std": 1.0, "grad_norm": 0.7665231823921204, "kl": 2.6322261691093445, "learning_rate": 4.861454725383474e-06, "loss": 0.0889662653207779, "num_tokens": 2491152.0, "reward": 0.2500000037252903, "reward_std": 0.32071349024772644, "rewards/reward_financial_reasoning/mean": 0.2500000037252903, "rewards/reward_financial_reasoning/std": 0.32071349024772644, "step": 282, "step_time": 45.58885034900595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 15.4375, "completions/mean_terminated_length": 15.4375, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.12644701691896706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0762035995721817, "kl": 4.372367188334465, "learning_rate": 4.8565066798614556e-06, "loss": 0.17135773599147797, "num_tokens": 2502879.0, "reward": 0.6749999970197678, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": 0.6749999970197678, "rewards/reward_financial_reasoning/std": 0.24053513258695602, "step": 284, "step_time": 15.359563550005987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.375, "completions/clipped_ratio": 0.0, "completions/max_length": 10.5, "completions/max_terminated_length": 10.5, "completions/mean_length": 7.375, "completions/mean_terminated_length": 7.375, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.12733748886910062, "frac_reward_zero_std": 1.0, "grad_norm": 0.6486235857009888, "kl": 4.935705095529556, "learning_rate": 4.851558634339436e-06, "loss": 0.1994798481464386, "num_tokens": 2516885.0, "reward": -0.125, "reward_std": 0.40089187026023865, "rewards/reward_financial_reasoning/mean": -0.125, "rewards/reward_financial_reasoning/std": 0.40089187026023865, "step": 286, "step_time": 10.359898515005625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.125, "completions/clipped_ratio": 0.0, "completions/max_length": 39.5, "completions/max_terminated_length": 39.5, "completions/mean_length": 25.125, "completions/mean_terminated_length": 25.125, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.12822796081923418, "frac_reward_zero_std": 1.0, "grad_norm": 0.27073779702186584, "kl": 3.24738085269928, "learning_rate": 4.8466105888174175e-06, "loss": 0.11370493471622467, "num_tokens": 2532095.0, "reward": 0.10000000149011612, "reward_std": 0.37416573613882065, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.37416574358940125, "step": 288, "step_time": 28.717111858499266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.12911843276936777, "frac_reward_zero_std": 0.75, "grad_norm": 5.002157688140869, "kl": 3.048767626285553, "learning_rate": 4.841662543295399e-06, "loss": 0.11240536719560623, "num_tokens": 2553159.0, "reward": -0.04999999329447746, "reward_std": 0.25354626774787903, "rewards/reward_financial_reasoning/mean": -0.04999999329447746, "rewards/reward_financial_reasoning/std": 0.2535462975502014, "step": 290, "step_time": 15.560649553492112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 27.5, "completions/max_terminated_length": 27.5, "completions/mean_length": 16.9375, "completions/mean_terminated_length": 16.9375, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.13000890471950133, "frac_reward_zero_std": 0.75, "grad_norm": 4.702836036682129, "kl": 3.7099992632865906, "learning_rate": 4.83671449777338e-06, "loss": 0.12139731645584106, "num_tokens": 2571374.0, "reward": 0.4750000238418579, "reward_std": 0.4641419053077698, "rewards/reward_financial_reasoning/mean": 0.4750000238418579, "rewards/reward_financial_reasoning/std": 0.46414193511009216, "step": 292, "step_time": 17.602761367503263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 7.0625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.13089937666963491, "frac_reward_zero_std": 1.0, "grad_norm": 24.566186904907227, "kl": 3.7748285830020905, "learning_rate": 4.831766452251361e-06, "loss": 0.1423129141330719, "num_tokens": 2591519.0, "reward": -0.125, "reward_std": 0.40089187026023865, "rewards/reward_financial_reasoning/mean": -0.125, "rewards/reward_financial_reasoning/std": 0.40089187026023865, "step": 294, "step_time": 14.158596545497858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.0625, "completions/clipped_ratio": 0.0625, "completions/max_length": 141.5, "completions/max_terminated_length": 40.0, "completions/mean_length": 31.0625, "completions/mean_terminated_length": 15.830357551574707, "completions/min_length": 4.5, "completions/min_terminated_length": 4.5, "epoch": 0.13178984861976847, "frac_reward_zero_std": 0.75, "grad_norm": 1.2435134649276733, "kl": 4.378018736839294, "learning_rate": 4.826818406729342e-06, "loss": 0.15879350900650024, "num_tokens": 2606584.0, "reward": 0.16249998658895493, "reward_std": 0.4624329060316086, "rewards/reward_financial_reasoning/mean": 0.16249998658895493, "rewards/reward_financial_reasoning/std": 0.4624328762292862, "step": 296, "step_time": 44.203227273505036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.625, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.13268032056990206, "frac_reward_zero_std": 0.75, "grad_norm": 0.1087723895907402, "kl": 2.2376338690519333, "learning_rate": 4.821870361207324e-06, "loss": -0.017229370772838593, "num_tokens": 2623290.0, "reward": 0.21250000968575478, "reward_std": 0.20310094952583313, "rewards/reward_financial_reasoning/mean": 0.21250000968575478, "rewards/reward_financial_reasoning/std": 0.20310097932815552, "step": 298, "step_time": 15.0541494709978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.625, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 13.625, "completions/mean_terminated_length": 13.625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.13357079252003562, "frac_reward_zero_std": 1.0, "grad_norm": 0.38564178347587585, "kl": 3.935918390750885, "learning_rate": 4.816922315685305e-06, "loss": 0.13589729368686676, "num_tokens": 2642180.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 300, "step_time": 16.082533718003106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.125, "completions/clipped_ratio": 0.0625, "completions/max_length": 136.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 29.125, "completions/mean_terminated_length": 14.178571701049805, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.13446126447016918, "frac_reward_zero_std": 0.75, "grad_norm": 0.15317149460315704, "kl": 2.9735162407159805, "learning_rate": 4.811974270163286e-06, "loss": 0.20121431350708008, "num_tokens": 2659278.0, "reward": 0.38750001788139343, "reward_std": 0.39018382132053375, "rewards/reward_financial_reasoning/mean": 0.38750001788139343, "rewards/reward_financial_reasoning/std": 0.39018386602401733, "step": 302, "step_time": 45.91662111900223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 20.0625, "completions/mean_terminated_length": 20.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.13535173642030277, "frac_reward_zero_std": 0.75, "grad_norm": 0.18480771780014038, "kl": 2.6496730744838715, "learning_rate": 4.807026224641267e-06, "loss": 0.13482548296451569, "num_tokens": 2677199.0, "reward": 0.11250000074505806, "reward_std": 0.27998724579811096, "rewards/reward_financial_reasoning/mean": 0.11250000074505806, "rewards/reward_financial_reasoning/std": 0.27998724579811096, "step": 304, "step_time": 18.568712541000423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.5625, "completions/mean_terminated_length": 13.5625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.13624220837043632, "frac_reward_zero_std": 0.75, "grad_norm": 0.08845102041959763, "kl": 3.689025968313217, "learning_rate": 4.8020781791192486e-06, "loss": 0.16846482455730438, "num_tokens": 2697064.0, "reward": 0.012500002980232239, "reward_std": 0.5139711201190948, "rewards/reward_financial_reasoning/mean": 0.012500002980232239, "rewards/reward_financial_reasoning/std": 0.513971135020256, "step": 306, "step_time": 16.80304030750267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1371326803205699, "frac_reward_zero_std": 1.0, "grad_norm": 2.1612541675567627, "kl": 2.348564103245735, "learning_rate": 4.7971301335972295e-06, "loss": 0.09607716649770737, "num_tokens": 2714328.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 308, "step_time": 15.943128220002109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 29.5, "completions/max_terminated_length": 29.5, "completions/mean_length": 18.4375, "completions/mean_terminated_length": 18.4375, "completions/min_length": 3.5, "completions/min_terminated_length": 3.5, "epoch": 0.13802315227070347, "frac_reward_zero_std": 0.5, "grad_norm": 0.14036618173122406, "kl": 4.25106380879879, "learning_rate": 4.7921820880752105e-06, "loss": 0.06849891692399979, "num_tokens": 2729847.0, "reward": 0.08750001154839993, "reward_std": 0.33053525537252426, "rewards/reward_financial_reasoning/mean": 0.08750001154839993, "rewards/reward_financial_reasoning/std": 0.33053525537252426, "step": 310, "step_time": 16.476932208006474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.5, "completions/max_terminated_length": 11.5, "completions/mean_length": 8.5, "completions/mean_terminated_length": 8.5, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.13891362422083706, "frac_reward_zero_std": 0.75, "grad_norm": 5.081897735595703, "kl": 2.9397522509098053, "learning_rate": 4.787234042553192e-06, "loss": 0.1418164074420929, "num_tokens": 2745783.0, "reward": -0.012499995529651642, "reward_std": 0.5226411670446396, "rewards/reward_financial_reasoning/mean": -0.012499995529651642, "rewards/reward_financial_reasoning/std": 0.5226411670446396, "step": 312, "step_time": 12.007526194989623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.3125, "completions/mean_terminated_length": 13.3125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.13980409617097062, "frac_reward_zero_std": 1.0, "grad_norm": 0.1045272946357727, "kl": 2.600666582584381, "learning_rate": 4.782285997031173e-06, "loss": 0.09802389144897461, "num_tokens": 2761556.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 314, "step_time": 12.98709403751127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 139.5, "completions/max_terminated_length": 78.5, "completions/mean_length": 38.0, "completions/mean_terminated_length": 24.125, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.14069456812110417, "frac_reward_zero_std": 0.75, "grad_norm": 1.935442328453064, "kl": 4.464976981282234, "learning_rate": 4.777337951509154e-06, "loss": 0.09435869753360748, "num_tokens": 2781652.0, "reward": -0.08750000037252903, "reward_std": 0.18624438345432281, "rewards/reward_financial_reasoning/mean": -0.08750000037252903, "rewards/reward_financial_reasoning/std": 0.1862443909049034, "step": 316, "step_time": 49.13465615150926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.8125, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 41.8125, "completions/mean_terminated_length": 11.214286088943481, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.14158504007123776, "frac_reward_zero_std": 0.5, "grad_norm": 6.196547985076904, "kl": 3.587283805012703, "learning_rate": 4.772389905987135e-06, "loss": 0.14025059342384338, "num_tokens": 2797705.0, "reward": 0.050000011920928955, "reward_std": 0.42308472096920013, "rewards/reward_financial_reasoning/mean": 0.050000011920928955, "rewards/reward_financial_reasoning/std": 0.4230847507715225, "step": 318, "step_time": 78.31156018701222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.75, "completions/clipped_ratio": 0.0625, "completions/max_length": 136.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 24.75, "completions/mean_terminated_length": 9.142857313156128, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.14247551202137132, "frac_reward_zero_std": 0.5, "grad_norm": 3.3096694946289062, "kl": 5.908802837133408, "learning_rate": 4.767441860465117e-06, "loss": 0.3304884731769562, "num_tokens": 2815309.0, "reward": 0.625, "reward_std": 0.40620189905166626, "rewards/reward_financial_reasoning/mean": 0.625, "rewards/reward_financial_reasoning/std": 0.40620194375514984, "step": 320, "step_time": 46.993786041497515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1433659839715049, "frac_reward_zero_std": 1.0, "grad_norm": 0.24115696549415588, "kl": 4.057637929916382, "learning_rate": 4.762493814943098e-06, "loss": 0.14424464106559753, "num_tokens": 2827581.0, "reward": 0.09999999962747097, "reward_std": 0.37416573613882065, "rewards/reward_financial_reasoning/mean": 0.09999999962747097, "rewards/reward_financial_reasoning/std": 0.37416573613882065, "step": 322, "step_time": 12.003390189493075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 12.0625, "completions/mean_terminated_length": 12.0625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.14425645592163847, "frac_reward_zero_std": 0.75, "grad_norm": 3.445495843887329, "kl": 4.4983391761779785, "learning_rate": 4.757545769421079e-06, "loss": 0.0963294506072998, "num_tokens": 2849382.0, "reward": 0.0625000149011612, "reward_std": 0.28618598729372025, "rewards/reward_financial_reasoning/mean": 0.0625000149011612, "rewards/reward_financial_reasoning/std": 0.28618600964546204, "step": 324, "step_time": 18.020186542500596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.125, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 30.125, "completions/mean_terminated_length": 30.125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.14514692787177205, "frac_reward_zero_std": 0.75, "grad_norm": 1.5247211456298828, "kl": 2.5776854157447815, "learning_rate": 4.75259772389906e-06, "loss": 0.1263631284236908, "num_tokens": 2870536.0, "reward": 0.08750000223517418, "reward_std": 0.1642080545425415, "rewards/reward_financial_reasoning/mean": 0.08750000223517418, "rewards/reward_financial_reasoning/std": 0.1642080694437027, "step": 326, "step_time": 41.201073117990745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 29.5, "completions/max_terminated_length": 29.5, "completions/mean_length": 15.3125, "completions/mean_terminated_length": 15.3125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1460373998219056, "frac_reward_zero_std": 1.0, "grad_norm": 0.31851768493652344, "kl": 4.2002551555633545, "learning_rate": 4.7476496783770416e-06, "loss": 0.1324121206998825, "num_tokens": 2879789.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 328, "step_time": 13.552819680997345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.875, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 20.875, "completions/mean_terminated_length": 20.875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.14692787177203917, "frac_reward_zero_std": 1.0, "grad_norm": 0.08847746253013611, "kl": 4.410182029008865, "learning_rate": 4.7427016328550225e-06, "loss": 0.1517459750175476, "num_tokens": 2901283.0, "reward": 0.32500000670552254, "reward_std": 0.24053511023521423, "rewards/reward_financial_reasoning/mean": 0.32500000670552254, "rewards/reward_financial_reasoning/std": 0.24053512513637543, "step": 330, "step_time": 21.379759306495544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.1875, "completions/clipped_ratio": 0.0625, "completions/max_length": 133.5, "completions/max_terminated_length": 10.0, "completions/mean_length": 23.1875, "completions/mean_terminated_length": 7.517857313156128, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.14781834372217276, "frac_reward_zero_std": 0.75, "grad_norm": 9.072896003723145, "kl": 5.56328746676445, "learning_rate": 4.7377535873330035e-06, "loss": 0.12063997983932495, "num_tokens": 2914286.0, "reward": 0.18750000558793545, "reward_std": 0.3389529511332512, "rewards/reward_financial_reasoning/mean": 0.18750000558793545, "rewards/reward_financial_reasoning/std": 0.3389529809355736, "step": 332, "step_time": 43.318069253498834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.14870881567230632, "frac_reward_zero_std": 1.0, "grad_norm": 0.5086295008659363, "kl": 2.4566811472177505, "learning_rate": 4.732805541810985e-06, "loss": 0.0956970602273941, "num_tokens": 2933166.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 334, "step_time": 16.838145778496255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.1875, "completions/clipped_ratio": 0.0625, "completions/max_length": 141.5, "completions/max_terminated_length": 35.0, "completions/mean_length": 35.1875, "completions/mean_terminated_length": 20.10714340209961, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1495992876224399, "frac_reward_zero_std": 0.75, "grad_norm": 11.059113502502441, "kl": 3.152929961681366, "learning_rate": 4.727857496288966e-06, "loss": 0.1890815645456314, "num_tokens": 2951673.0, "reward": 0.11250000447034836, "reward_std": 0.31000544875860214, "rewards/reward_financial_reasoning/mean": 0.11250000447034836, "rewards/reward_financial_reasoning/std": 0.3100054860115051, "step": 336, "step_time": 50.63826839199464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.4375, "completions/mean_terminated_length": 12.4375, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.15048975957257346, "frac_reward_zero_std": 0.75, "grad_norm": 3.7672529220581055, "kl": 2.8252325542271137, "learning_rate": 4.722909450766948e-06, "loss": 0.07159600406885147, "num_tokens": 2967208.0, "reward": 0.16249998658895493, "reward_std": 0.4624329060316086, "rewards/reward_financial_reasoning/mean": 0.16249998658895493, "rewards/reward_financial_reasoning/std": 0.4624328762292862, "step": 338, "step_time": 14.010528551505558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.15138023152270705, "frac_reward_zero_std": 1.0, "grad_norm": 9.698003768920898, "kl": 4.801441490650177, "learning_rate": 4.717961405244928e-06, "loss": 0.19205763936042786, "num_tokens": 2983600.0, "reward": -0.12500000186264515, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": -0.12500000186264515, "rewards/reward_financial_reasoning/std": 0.24053511768579483, "step": 340, "step_time": 14.301874094995583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.1522707034728406, "frac_reward_zero_std": 1.0, "grad_norm": 0.16439253091812134, "kl": 5.160941481590271, "learning_rate": 4.71301335972291e-06, "loss": 0.1990242302417755, "num_tokens": 2999064.0, "reward": -0.07500000298023224, "reward_std": 0.45434411615133286, "rewards/reward_financial_reasoning/mean": -0.07500000298023224, "rewards/reward_financial_reasoning/std": 0.45434411615133286, "step": 342, "step_time": 13.344863591006288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 4.5, "completions/min_terminated_length": 4.5, "epoch": 0.15316117542297417, "frac_reward_zero_std": 0.75, "grad_norm": 4.908713340759277, "kl": 5.427981078624725, "learning_rate": 4.708065314200891e-06, "loss": 0.20119166374206543, "num_tokens": 3010456.0, "reward": 0.7125000208616257, "reward_std": 0.22243821248412132, "rewards/reward_financial_reasoning/mean": 0.7125000208616257, "rewards/reward_financial_reasoning/std": 0.22243822365999222, "step": 344, "step_time": 10.042046501504956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.375, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 17.375, "completions/mean_terminated_length": 17.375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.15405164737310775, "frac_reward_zero_std": 1.0, "grad_norm": 0.10223693400621414, "kl": 3.115888088941574, "learning_rate": 4.703117268678873e-06, "loss": 0.12509028613567352, "num_tokens": 3027478.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 346, "step_time": 15.962319723501423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 21.5625, "completions/mean_terminated_length": 21.5625, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.1549421193232413, "frac_reward_zero_std": 1.0, "grad_norm": 0.2353672981262207, "kl": 2.3216539919376373, "learning_rate": 4.698169223156854e-06, "loss": 0.09396439045667648, "num_tokens": 3046503.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 348, "step_time": 17.60655723549644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 24.4375, "completions/mean_terminated_length": 24.4375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1558325912733749, "frac_reward_zero_std": 0.75, "grad_norm": 0.0838085487484932, "kl": 3.7829702496528625, "learning_rate": 4.6932211776348345e-06, "loss": 0.1546584516763687, "num_tokens": 3064462.0, "reward": 0.0, "reward_std": 0.32293298840522766, "rewards/reward_financial_reasoning/mean": 0.0, "rewards/reward_financial_reasoning/std": 0.32293298840522766, "step": 350, "step_time": 19.438781550499698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.875, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 13.875, "completions/mean_terminated_length": 13.875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.15672306322350846, "frac_reward_zero_std": 0.75, "grad_norm": 6.361366271972656, "kl": 3.5697886049747467, "learning_rate": 4.688273132112816e-06, "loss": 0.024487487971782684, "num_tokens": 3085988.0, "reward": 0.16249999962747097, "reward_std": 0.327665738761425, "rewards/reward_financial_reasoning/mean": 0.16249999962747097, "rewards/reward_financial_reasoning/std": 0.3276657685637474, "step": 352, "step_time": 16.988179097501416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 15.75, "completions/mean_terminated_length": 15.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.15761353517364202, "frac_reward_zero_std": 1.0, "grad_norm": 0.11495406925678253, "kl": 2.6656560599803925, "learning_rate": 4.683325086590797e-06, "loss": 0.10649028420448303, "num_tokens": 3108080.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 354, "step_time": 17.6394682640057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.5, "completions/max_terminated_length": 29.5, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1585040071237756, "frac_reward_zero_std": 1.0, "grad_norm": 0.16622397303581238, "kl": 2.5759568214416504, "learning_rate": 4.678377041068778e-06, "loss": 0.104760080575943, "num_tokens": 3123224.0, "reward": 0.17500000074505806, "reward_std": 0.29398736357688904, "rewards/reward_financial_reasoning/mean": 0.17500000074505806, "rewards/reward_financial_reasoning/std": 0.29398736357688904, "step": 356, "step_time": 16.088501062498835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 23.5, "completions/max_terminated_length": 23.5, "completions/mean_length": 15.4375, "completions/mean_terminated_length": 15.4375, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.15939447907390916, "frac_reward_zero_std": 1.0, "grad_norm": 0.4403320550918579, "kl": 2.555238038301468, "learning_rate": 4.673428995546759e-06, "loss": 0.08659804612398148, "num_tokens": 3145647.0, "reward": 0.3750000149011612, "reward_std": 0.45434410870075226, "rewards/reward_financial_reasoning/mean": 0.3750000149011612, "rewards/reward_financial_reasoning/std": 0.45434412360191345, "step": 358, "step_time": 19.564347899991844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.1875, "completions/mean_terminated_length": 20.1875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.16028495102404275, "frac_reward_zero_std": 1.0, "grad_norm": 0.5063008069992065, "kl": 2.5525257289409637, "learning_rate": 4.668480950024741e-06, "loss": 0.10185902565717697, "num_tokens": 3166106.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 360, "step_time": 17.824098634006077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.25, "completions/clipped_ratio": 0.0625, "completions/max_length": 137.5, "completions/max_terminated_length": 56.5, "completions/mean_length": 36.25, "completions/mean_terminated_length": 22.375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1611754229741763, "frac_reward_zero_std": 0.75, "grad_norm": 0.8783189058303833, "kl": 3.123973160982132, "learning_rate": 4.663532904502722e-06, "loss": 0.15637540817260742, "num_tokens": 3186542.0, "reward": -0.0875000013038516, "reward_std": 0.266422763466835, "rewards/reward_financial_reasoning/mean": -0.0875000013038516, "rewards/reward_financial_reasoning/std": 0.266422763466835, "step": 362, "step_time": 51.280559887498384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.625, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 10.625, "completions/mean_terminated_length": 10.625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.1620658949243099, "frac_reward_zero_std": 1.0, "grad_norm": 0.6702510714530945, "kl": 5.710592150688171, "learning_rate": 4.658584858980703e-06, "loss": 0.21924875676631927, "num_tokens": 3203200.0, "reward": -0.30000000447034836, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -0.30000000447034836, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 364, "step_time": 13.775912964996678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.16295636687444345, "frac_reward_zero_std": 1.0, "grad_norm": 0.2933957278728485, "kl": 6.98069429397583, "learning_rate": 4.653636813458684e-06, "loss": 0.2750336825847626, "num_tokens": 3223940.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 366, "step_time": 17.495442778003053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.5, "completions/max_terminated_length": 44.5, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.163846838824577, "frac_reward_zero_std": 1.0, "grad_norm": 0.2866394519805908, "kl": 3.2483417838811874, "learning_rate": 4.648688767936666e-06, "loss": 0.11545059829950333, "num_tokens": 3246244.0, "reward": 0.22500000149011612, "reward_std": 0.34743960946798325, "rewards/reward_financial_reasoning/mean": 0.22500000149011612, "rewards/reward_financial_reasoning/std": 0.34743963181972504, "step": 368, "step_time": 25.16599783250058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.75, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 12.75, "completions/mean_terminated_length": 12.75, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.1647373107747106, "frac_reward_zero_std": 1.0, "grad_norm": 0.18002893030643463, "kl": 3.9817181825637817, "learning_rate": 4.643740722414647e-06, "loss": 0.1207280308008194, "num_tokens": 3263880.0, "reward": -0.22500000149011612, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": -0.22500000149011612, "rewards/reward_financial_reasoning/std": 0.18708287179470062, "step": 370, "step_time": 15.068639872995846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 13.0625, "completions/mean_terminated_length": 13.0625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.16562778272484416, "frac_reward_zero_std": 1.0, "grad_norm": 0.23808419704437256, "kl": 4.9848949164152145, "learning_rate": 4.6387926768926275e-06, "loss": 0.1741630584001541, "num_tokens": 3276721.0, "reward": 0.6749999970197678, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": 0.6749999970197678, "rewards/reward_financial_reasoning/std": 0.24053513258695602, "step": 372, "step_time": 11.880487833997904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 22.3125, "completions/mean_terminated_length": 22.3125, "completions/min_length": 19.5, "completions/min_terminated_length": 19.5, "epoch": 0.16651825467497774, "frac_reward_zero_std": 1.0, "grad_norm": 0.23066724836826324, "kl": 2.4786416590213776, "learning_rate": 4.633844631370609e-06, "loss": 0.09922318905591965, "num_tokens": 3297198.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 374, "step_time": 18.64603684699614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 38.1875, "completions/clipped_ratio": 0.0625, "completions/max_length": 138.5, "completions/max_terminated_length": 94.0, "completions/mean_length": 38.1875, "completions/mean_terminated_length": 24.455358505249023, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.1674087266251113, "frac_reward_zero_std": 1.0, "grad_norm": 0.3857172429561615, "kl": 2.536154806613922, "learning_rate": 4.62889658584859e-06, "loss": 0.08426443487405777, "num_tokens": 3326601.0, "reward": -0.20000000298023224, "reward_std": 0.32071349024772644, "rewards/reward_financial_reasoning/mean": -0.20000000298023224, "rewards/reward_financial_reasoning/std": 0.32071349024772644, "step": 376, "step_time": 58.15682624900728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 13.5, "completions/max_terminated_length": 13.5, "completions/mean_length": 11.4375, "completions/mean_terminated_length": 11.4375, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.1682991985752449, "frac_reward_zero_std": 0.75, "grad_norm": 0.25235873460769653, "kl": 4.639036536216736, "learning_rate": 4.623948540326571e-06, "loss": 0.20123405754566193, "num_tokens": 3338424.0, "reward": 0.02500000037252903, "reward_std": 0.2121320366859436, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.2121320366859436, "step": 378, "step_time": 10.856103686997812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.125, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 19.125, "completions/mean_terminated_length": 19.125, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.16918967052537845, "frac_reward_zero_std": 1.0, "grad_norm": 0.11810299009084702, "kl": 2.776392698287964, "learning_rate": 4.619000494804552e-06, "loss": 0.10224653035402298, "num_tokens": 3350586.0, "reward": -0.22500000149011612, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": -0.22500000149011612, "rewards/reward_financial_reasoning/std": 0.18708287179470062, "step": 380, "step_time": 14.638130175502738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.5, "completions/max_terminated_length": 43.5, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.170080142475512, "frac_reward_zero_std": 0.75, "grad_norm": 0.19874098896980286, "kl": 2.42767196893692, "learning_rate": 4.614052449282534e-06, "loss": 0.05534215644001961, "num_tokens": 3367010.0, "reward": 0.2000000085681677, "reward_std": 0.3259558826684952, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.32595589756965637, "step": 382, "step_time": 21.40752187149701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 17.8125, "completions/mean_terminated_length": 17.8125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1709706144256456, "frac_reward_zero_std": 1.0, "grad_norm": 0.36154139041900635, "kl": 3.0273653864860535, "learning_rate": 4.609104403760515e-06, "loss": 0.11987186968326569, "num_tokens": 3383775.0, "reward": -0.22500000149011612, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": -0.22500000149011612, "rewards/reward_financial_reasoning/std": 0.18708287179470062, "step": 384, "step_time": 16.326452354001958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 12.1875, "completions/mean_terminated_length": 12.1875, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.17186108637577915, "frac_reward_zero_std": 1.0, "grad_norm": 0.19802482426166534, "kl": 3.4667557179927826, "learning_rate": 4.604156358238496e-06, "loss": 0.13101685047149658, "num_tokens": 3404386.0, "reward": -0.12500000186264515, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": -0.12500000186264515, "rewards/reward_financial_reasoning/std": 0.24053511768579483, "step": 386, "step_time": 16.66828526950121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 16.75, "completions/mean_terminated_length": 16.75, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.17275155832591274, "frac_reward_zero_std": 1.0, "grad_norm": 0.17383332550525665, "kl": 2.8899713158607483, "learning_rate": 4.599208312716477e-06, "loss": 0.10997433215379715, "num_tokens": 3426190.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 388, "step_time": 19.155468865996227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 53.1875, "completions/clipped_ratio": 0.125, "completions/max_length": 151.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 53.1875, "completions/mean_terminated_length": 23.479166984558105, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1736420302760463, "frac_reward_zero_std": 1.0, "grad_norm": 3.8917670249938965, "kl": 3.196235477924347, "learning_rate": 4.594260267194459e-06, "loss": 0.10975559055805206, "num_tokens": 3448657.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 390, "step_time": 55.84161307200702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.125, "completions/clipped_ratio": 0.0625, "completions/max_length": 138.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 35.125, "completions/mean_terminated_length": 20.500000953674316, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1745325022261799, "frac_reward_zero_std": 0.75, "grad_norm": 1.013735055923462, "kl": 3.2146300077438354, "learning_rate": 4.5893122216724396e-06, "loss": 0.24231016635894775, "num_tokens": 3465963.0, "reward": 0.02499999850988388, "reward_std": 0.4057116433978081, "rewards/reward_financial_reasoning/mean": 0.02499999850988388, "rewards/reward_financial_reasoning/std": 0.40571165084838867, "step": 392, "step_time": 48.684688639499655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.375, "completions/clipped_ratio": 0.0625, "completions/max_length": 132.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 27.375, "completions/mean_terminated_length": 12.4375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.17542297417631345, "frac_reward_zero_std": 1.0, "grad_norm": 21.446544647216797, "kl": 3.7770427837967873, "learning_rate": 4.584364176150421e-06, "loss": 0.14160798490047455, "num_tokens": 3482145.0, "reward": 0.12500000558793545, "reward_std": 0.34743960946798325, "rewards/reward_financial_reasoning/mean": 0.12500000558793545, "rewards/reward_financial_reasoning/std": 0.34743960946798325, "step": 394, "step_time": 46.83634573549716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.125, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 14.125, "completions/mean_terminated_length": 14.125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.176313446126447, "frac_reward_zero_std": 1.0, "grad_norm": 0.30426764488220215, "kl": 2.9290217757225037, "learning_rate": 4.579416130628402e-06, "loss": 0.10413938015699387, "num_tokens": 3495651.0, "reward": 0.45000001788139343, "reward_std": 0.37416574358940125, "rewards/reward_financial_reasoning/mean": 0.45000001788139343, "rewards/reward_financial_reasoning/std": 0.37416577339172363, "step": 396, "step_time": 13.700383353498182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 20.0625, "completions/mean_terminated_length": 20.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1772039180765806, "frac_reward_zero_std": 0.75, "grad_norm": 0.12722289562225342, "kl": 2.9362562894821167, "learning_rate": 4.574468085106383e-06, "loss": 0.10540154576301575, "num_tokens": 3517420.0, "reward": 0.06250000093132257, "reward_std": 0.1060660183429718, "rewards/reward_financial_reasoning/mean": 0.06250000093132257, "rewards/reward_financial_reasoning/std": 0.1060660183429718, "step": 398, "step_time": 21.081551555500482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 78.625, "completions/clipped_ratio": 0.25, "completions/max_length": 143.5, "completions/max_terminated_length": 27.0, "completions/mean_length": 78.625, "completions/mean_terminated_length": 20.375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.17809439002671415, "frac_reward_zero_std": 0.75, "grad_norm": 3.1222035884857178, "kl": 4.669141083955765, "learning_rate": 4.569520039584365e-06, "loss": 0.18662548065185547, "num_tokens": 3539174.0, "reward": 0.08749999664723873, "reward_std": 0.3001621440052986, "rewards/reward_financial_reasoning/mean": 0.08749999664723873, "rewards/reward_financial_reasoning/std": 0.3001621440052986, "step": 400, "step_time": 53.09678938449724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.17898486197684774, "frac_reward_zero_std": 1.0, "grad_norm": 0.3273642659187317, "kl": 3.055632010102272, "learning_rate": 4.564571994062346e-06, "loss": 0.12121474742889404, "num_tokens": 3558926.0, "reward": -0.15000000596046448, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": -0.15000000596046448, "rewards/reward_financial_reasoning/std": 0.26726125180721283, "step": 402, "step_time": 16.527376137997635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 61.25, "completions/clipped_ratio": 0.1875, "completions/max_length": 145.0, "completions/max_terminated_length": 27.5, "completions/mean_length": 61.25, "completions/mean_terminated_length": 16.625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1798753339269813, "frac_reward_zero_std": 1.0, "grad_norm": 0.6433550119400024, "kl": 2.9147322699427605, "learning_rate": 4.559623948540327e-06, "loss": 0.09967145323753357, "num_tokens": 3577754.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 404, "step_time": 51.886354368994944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 7.5, "completions/clipped_ratio": 0.0, "completions/max_length": 8.5, "completions/max_terminated_length": 8.5, "completions/mean_length": 7.5, "completions/mean_terminated_length": 7.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.18076580587711488, "frac_reward_zero_std": 0.75, "grad_norm": 0.7596104741096497, "kl": 5.5971426367759705, "learning_rate": 4.554675903018308e-06, "loss": 0.22060047090053558, "num_tokens": 3591514.0, "reward": 0.0, "reward_std": 0.1963960975408554, "rewards/reward_financial_reasoning/mean": 0.0, "rewards/reward_financial_reasoning/std": 0.1963961124420166, "step": 406, "step_time": 10.784611634997418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.18165627782724844, "frac_reward_zero_std": 1.0, "grad_norm": 0.2548048496246338, "kl": 2.3614984899759293, "learning_rate": 4.54972785749629e-06, "loss": 0.09047777950763702, "num_tokens": 3603050.0, "reward": -0.20000000298023224, "reward_std": 0.32071349024772644, "rewards/reward_financial_reasoning/mean": -0.20000000298023224, "rewards/reward_financial_reasoning/std": 0.32071349024772644, "step": 408, "step_time": 10.890331844006141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.8125, "completions/clipped_ratio": 0.125, "completions/max_length": 146.0, "completions/max_terminated_length": 27.5, "completions/mean_length": 48.8125, "completions/mean_terminated_length": 19.08333396911621, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.182546749777382, "frac_reward_zero_std": 0.75, "grad_norm": 0.14107643067836761, "kl": 3.473247766494751, "learning_rate": 4.544779811974271e-06, "loss": 0.24801112711429596, "num_tokens": 3625639.0, "reward": 0.0, "reward_std": 0.35675284266471863, "rewards/reward_financial_reasoning/mean": 0.0, "rewards/reward_financial_reasoning/std": 0.3567528575658798, "step": 410, "step_time": 54.626304682009504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 19.5, "completions/min_terminated_length": 19.5, "epoch": 0.1834372217275156, "frac_reward_zero_std": 1.0, "grad_norm": 0.7728067636489868, "kl": 2.5951511412858963, "learning_rate": 4.539831766452252e-06, "loss": 0.10369659960269928, "num_tokens": 3649347.0, "reward": -3.725290298461914e-09, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 412, "step_time": 19.267440424002416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 26.5, "completions/max_terminated_length": 26.5, "completions/mean_length": 19.8125, "completions/mean_terminated_length": 19.8125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.18432769367764915, "frac_reward_zero_std": 1.0, "grad_norm": 0.2641366422176361, "kl": 2.1459864526987076, "learning_rate": 4.5348837209302326e-06, "loss": 0.08479943871498108, "num_tokens": 3661528.0, "reward": 0.45000001788139343, "reward_std": 0.37416574358940125, "rewards/reward_financial_reasoning/mean": 0.45000001788139343, "rewards/reward_financial_reasoning/std": 0.37416577339172363, "step": 414, "step_time": 13.814018774002761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 13.0625, "completions/mean_terminated_length": 13.0625, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.18521816562778273, "frac_reward_zero_std": 1.0, "grad_norm": 0.4945250153541565, "kl": 2.9109133034944534, "learning_rate": 4.529935675408214e-06, "loss": 0.11534873396158218, "num_tokens": 3683425.0, "reward": 0.02499999850988388, "reward_std": 0.45434408634901047, "rewards/reward_financial_reasoning/mean": 0.02499999850988388, "rewards/reward_financial_reasoning/std": 0.45434409379959106, "step": 416, "step_time": 17.516358232995117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 46.5625, "completions/mean_terminated_length": 46.5625, "completions/min_length": 26.5, "completions/min_terminated_length": 26.5, "epoch": 0.1861086375779163, "frac_reward_zero_std": 1.0, "grad_norm": 0.4000868797302246, "kl": 2.132822684943676, "learning_rate": 4.524987629886195e-06, "loss": 0.08503614366054535, "num_tokens": 3699290.0, "reward": 0.07499999552965164, "reward_std": 0.40089186280965805, "rewards/reward_financial_reasoning/mean": 0.07499999552965164, "rewards/reward_financial_reasoning/std": 0.40089187026023865, "step": 418, "step_time": 25.094515656495787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.8125, "completions/clipped_ratio": 0.0625, "completions/max_length": 140.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 30.8125, "completions/mean_terminated_length": 15.785714626312256, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.18699910952804988, "frac_reward_zero_std": 0.75, "grad_norm": 0.17446395754814148, "kl": 3.0325257033109665, "learning_rate": 4.520039584364176e-06, "loss": 0.2618591785430908, "num_tokens": 3718951.0, "reward": 0.2875000089406967, "reward_std": 0.4670701175928116, "rewards/reward_financial_reasoning/mean": 0.2875000089406967, "rewards/reward_financial_reasoning/std": 0.4670701324939728, "step": 420, "step_time": 51.2821328200007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 9.3125, "completions/mean_terminated_length": 9.3125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.18788958147818344, "frac_reward_zero_std": 0.5, "grad_norm": 8.17707347869873, "kl": 7.205821305513382, "learning_rate": 4.515091538842158e-06, "loss": 0.450656533241272, "num_tokens": 3734572.0, "reward": 0.11250001192092896, "reward_std": 0.4884578585624695, "rewards/reward_financial_reasoning/mean": 0.11250001192092896, "rewards/reward_financial_reasoning/std": 0.48845788836479187, "step": 422, "step_time": 16.285566200003814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.125, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 12.125, "completions/mean_terminated_length": 12.125, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.188780053428317, "frac_reward_zero_std": 1.0, "grad_norm": 0.2911452353000641, "kl": 4.378904566168785, "learning_rate": 4.510143493320139e-06, "loss": 0.14741505682468414, "num_tokens": 3750030.0, "reward": 0.20000000670552254, "reward_std": 0.26726123690605164, "rewards/reward_financial_reasoning/mean": 0.20000000670552254, "rewards/reward_financial_reasoning/std": 0.26726123690605164, "step": 424, "step_time": 15.573336581994226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 11.6875, "completions/mean_terminated_length": 11.6875, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.18967052537845058, "frac_reward_zero_std": 1.0, "grad_norm": 2.824861764907837, "kl": 5.281881749629974, "learning_rate": 4.50519544779812e-06, "loss": 0.2081257700920105, "num_tokens": 3764001.0, "reward": 0.25, "reward_std": 0.6948792338371277, "rewards/reward_financial_reasoning/mean": 0.25, "rewards/reward_financial_reasoning/std": 0.6948792338371277, "step": 426, "step_time": 13.06527343300695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 12.5, "completions/max_terminated_length": 12.5, "completions/mean_length": 10.0625, "completions/mean_terminated_length": 10.0625, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.19056099732858414, "frac_reward_zero_std": 1.0, "grad_norm": 0.71004319190979, "kl": 3.232193171977997, "learning_rate": 4.500247402276101e-06, "loss": 0.12972982227802277, "num_tokens": 3779722.0, "reward": 0.1250000037252903, "reward_std": 0.34743958711624146, "rewards/reward_financial_reasoning/mean": 0.1250000037252903, "rewards/reward_financial_reasoning/std": 0.34743958711624146, "step": 428, "step_time": 12.194943665497703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.125, "completions/clipped_ratio": 0.0, "completions/max_length": 30.5, "completions/max_terminated_length": 30.5, "completions/mean_length": 18.125, "completions/mean_terminated_length": 18.125, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.19145146927871773, "frac_reward_zero_std": 1.0, "grad_norm": 1.1838688850402832, "kl": 2.4279059320688248, "learning_rate": 4.495299356754083e-06, "loss": 0.0850219801068306, "num_tokens": 3793124.0, "reward": 0.4999999888241291, "reward_std": 0.05345224589109421, "rewards/reward_financial_reasoning/mean": 0.4999999888241291, "rewards/reward_financial_reasoning/std": 0.05345224589109421, "step": 430, "step_time": 16.18135965249894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 23.5, "completions/max_terminated_length": 23.5, "completions/mean_length": 15.3125, "completions/mean_terminated_length": 15.3125, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.1923419412288513, "frac_reward_zero_std": 1.0, "grad_norm": 0.6881890892982483, "kl": 3.365168124437332, "learning_rate": 4.490351311232064e-06, "loss": 0.1304081380367279, "num_tokens": 3814681.0, "reward": -0.125, "reward_std": 0.29398736357688904, "rewards/reward_financial_reasoning/mean": -0.125, "rewards/reward_financial_reasoning/std": 0.29398736357688904, "step": 432, "step_time": 18.796456829495582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.3125, "completions/mean_terminated_length": 20.3125, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.19323241317898487, "frac_reward_zero_std": 1.0, "grad_norm": 8.092527389526367, "kl": 2.5298050343990326, "learning_rate": 4.485403265710045e-06, "loss": 0.09655264765024185, "num_tokens": 3831206.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 434, "step_time": 16.56057541100381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0625, "completions/max_length": 134.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 10.535714626312256, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.19412288512911843, "frac_reward_zero_std": 0.75, "grad_norm": 1.42697274684906, "kl": 2.7040752321481705, "learning_rate": 4.480455220188026e-06, "loss": 0.14201709628105164, "num_tokens": 3845242.0, "reward": 0.23750001192092896, "reward_std": 0.5119454711675644, "rewards/reward_financial_reasoning/mean": 0.23750001192092896, "rewards/reward_financial_reasoning/std": 0.5119454860687256, "step": 436, "step_time": 43.56783085800271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 33.5, "completions/max_terminated_length": 33.5, "completions/mean_length": 21.8125, "completions/mean_terminated_length": 21.8125, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.195013357079252, "frac_reward_zero_std": 1.0, "grad_norm": 0.33076462149620056, "kl": 1.9392491281032562, "learning_rate": 4.475507174666007e-06, "loss": 0.06251571327447891, "num_tokens": 3858575.0, "reward": 0.1250000037252903, "reward_std": 0.34743958711624146, "rewards/reward_financial_reasoning/mean": 0.1250000037252903, "rewards/reward_financial_reasoning/std": 0.34743958711624146, "step": 438, "step_time": 16.1721732709957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.6875, "completions/clipped_ratio": 0.1875, "completions/max_length": 150.0, "completions/max_terminated_length": 50.5, "completions/mean_length": 64.6875, "completions/mean_terminated_length": 21.225000381469727, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.19590382902938558, "frac_reward_zero_std": 1.0, "grad_norm": 971.8073120117188, "kl": 15.128244251012802, "learning_rate": 4.470559129143988e-06, "loss": 1.0460975170135498, "num_tokens": 3892042.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 440, "step_time": 63.478058283501014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.19679430097951914, "frac_reward_zero_std": 1.0, "grad_norm": 0.16787387430667877, "kl": 4.265814155340195, "learning_rate": 4.465611083621969e-06, "loss": 0.149496391415596, "num_tokens": 3907522.0, "reward": 0.0, "reward_std": 0.37416573613882065, "rewards/reward_financial_reasoning/mean": 0.0, "rewards/reward_financial_reasoning/std": 0.37416573613882065, "step": 442, "step_time": 16.013422278996586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.375, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 24.375, "completions/mean_terminated_length": 24.375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.19768477292965272, "frac_reward_zero_std": 0.75, "grad_norm": 6.1105194091796875, "kl": 2.2890444099903107, "learning_rate": 4.460663038099951e-06, "loss": 0.10768984258174896, "num_tokens": 3926016.0, "reward": 1.4901161193847656e-08, "reward_std": 0.27705904096364975, "rewards/reward_financial_reasoning/mean": 1.4901161193847656e-08, "rewards/reward_financial_reasoning/std": 0.27705905586481094, "step": 444, "step_time": 20.03814392700224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 48.5, "completions/max_terminated_length": 48.5, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 23.5, "completions/min_terminated_length": 23.5, "epoch": 0.19857524487978628, "frac_reward_zero_std": 0.75, "grad_norm": 4.598520278930664, "kl": 2.6320609748363495, "learning_rate": 4.455714992577932e-06, "loss": 0.11514586955308914, "num_tokens": 3946592.0, "reward": 0.32500001043081284, "reward_std": 0.38347896933555603, "rewards/reward_financial_reasoning/mean": 0.32500001043081284, "rewards/reward_financial_reasoning/std": 0.3834789991378784, "step": 446, "step_time": 25.14641531599409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0625, "completions/max_length": 133.5, "completions/max_terminated_length": 13.0, "completions/mean_length": 27.25, "completions/mean_terminated_length": 12.116071701049805, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.19946571682991987, "frac_reward_zero_std": 0.75, "grad_norm": 0.15830379724502563, "kl": 2.5864310264587402, "learning_rate": 4.450766947055914e-06, "loss": 0.17798259854316711, "num_tokens": 3967716.0, "reward": -0.0875000013038516, "reward_std": 0.266422763466835, "rewards/reward_financial_reasoning/mean": -0.0875000013038516, "rewards/reward_financial_reasoning/std": 0.266422763466835, "step": 448, "step_time": 49.958261707997735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.3125, "completions/clipped_ratio": 0.125, "completions/max_length": 133.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 43.3125, "completions/mean_terminated_length": 13.833333969116211, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.20035618878005343, "frac_reward_zero_std": 0.75, "grad_norm": 8.354418754577637, "kl": 3.437360942363739, "learning_rate": 4.445818901533894e-06, "loss": 0.10762486606836319, "num_tokens": 3985121.0, "reward": 0.25000000558793545, "reward_std": 0.31163340061903, "rewards/reward_financial_reasoning/mean": 0.25000000558793545, "rewards/reward_financial_reasoning/std": 0.31163340061903, "step": 450, "step_time": 47.905381616998056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.4375, "completions/clipped_ratio": 0.0625, "completions/max_length": 138.5, "completions/max_terminated_length": 26.5, "completions/mean_length": 32.4375, "completions/mean_terminated_length": 17.705357551574707, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.201246660730187, "frac_reward_zero_std": 0.75, "grad_norm": 7.903286457061768, "kl": 2.700325161218643, "learning_rate": 4.440870856011876e-06, "loss": 0.1732444018125534, "num_tokens": 4005176.0, "reward": 0.15000000223517418, "reward_std": 0.1963960975408554, "rewards/reward_financial_reasoning/mean": 0.15000000223517418, "rewards/reward_financial_reasoning/std": 0.1963961124420166, "step": 452, "step_time": 49.246475757499866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 30.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 22.9375, "completions/mean_terminated_length": 22.9375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.20213713268032057, "frac_reward_zero_std": 1.0, "grad_norm": 0.10897312313318253, "kl": 2.5111473351716995, "learning_rate": 4.435922810489857e-06, "loss": 0.10052955150604248, "num_tokens": 4027079.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 454, "step_time": 20.917127861499466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 18.0625, "completions/mean_terminated_length": 18.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.20302760463045413, "frac_reward_zero_std": 1.0, "grad_norm": 0.2660609483718872, "kl": 3.365581303834915, "learning_rate": 4.4309747649678384e-06, "loss": 0.1130051389336586, "num_tokens": 4044392.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 456, "step_time": 17.129023396002594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.375, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 14.375, "completions/mean_terminated_length": 14.375, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.20391807658058772, "frac_reward_zero_std": 1.0, "grad_norm": 0.14781758189201355, "kl": 2.1308258026838303, "learning_rate": 4.426026719445819e-06, "loss": 0.08490869402885437, "num_tokens": 4061966.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 458, "step_time": 14.371157450499595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.125, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 26.125, "completions/mean_terminated_length": 26.125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.20480854853072128, "frac_reward_zero_std": 0.75, "grad_norm": 0.6753233671188354, "kl": 6.325722843408585, "learning_rate": 4.4210786739238e-06, "loss": 0.24554677307605743, "num_tokens": 4079328.0, "reward": 0.04999999701976776, "reward_std": 0.3033005967736244, "rewards/reward_financial_reasoning/mean": 0.04999999701976776, "rewards/reward_financial_reasoning/std": 0.3033006191253662, "step": 460, "step_time": 17.608918988491496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.4375, "completions/clipped_ratio": 0.0625, "completions/max_length": 138.5, "completions/max_terminated_length": 27.5, "completions/mean_length": 36.4375, "completions/mean_terminated_length": 21.91964340209961, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.20569902048085487, "frac_reward_zero_std": 0.75, "grad_norm": 0.20720401406288147, "kl": 1.656369112432003, "learning_rate": 4.416130628401782e-06, "loss": 0.13556653261184692, "num_tokens": 4099591.0, "reward": -0.01249999925494194, "reward_std": 0.12464234232902527, "rewards/reward_financial_reasoning/mean": -0.01249999925494194, "rewards/reward_financial_reasoning/std": 0.12464234232902527, "step": 462, "step_time": 50.19382210749973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.125, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 13.125, "completions/mean_terminated_length": 13.125, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.20658949243098843, "frac_reward_zero_std": 1.0, "grad_norm": 0.4629965126514435, "kl": 2.888270825147629, "learning_rate": 4.411182582879763e-06, "loss": 0.10317643731832504, "num_tokens": 4114745.0, "reward": -0.22500000149011612, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": -0.22500000149011612, "rewards/reward_financial_reasoning/std": 0.18708287924528122, "step": 464, "step_time": 13.621162455001468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.6875, "completions/clipped_ratio": 0.125, "completions/max_length": 140.5, "completions/max_terminated_length": 25.0, "completions/mean_length": 46.6875, "completions/mean_terminated_length": 16.9375, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.20747996438112198, "frac_reward_zero_std": 0.5, "grad_norm": 3.16763973236084, "kl": 1.9613378066569567, "learning_rate": 4.406234537357744e-06, "loss": 0.14753587543964386, "num_tokens": 4139204.0, "reward": 0.11250001192092896, "reward_std": 0.44064295291900635, "rewards/reward_financial_reasoning/mean": 0.11250001192092896, "rewards/reward_financial_reasoning/std": 0.44064295291900635, "step": 466, "step_time": 54.44558242650237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.20837043633125557, "frac_reward_zero_std": 0.5, "grad_norm": 0.4123859703540802, "kl": 3.0889033526182175, "learning_rate": 4.401286491835725e-06, "loss": 0.04875720292329788, "num_tokens": 4157948.0, "reward": 0.20000001043081284, "reward_std": 0.36730900406837463, "rewards/reward_financial_reasoning/mean": 0.20000001043081284, "rewards/reward_financial_reasoning/std": 0.3673090487718582, "step": 468, "step_time": 15.994729412002926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.5, "completions/max_terminated_length": 31.5, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.20926090828138913, "frac_reward_zero_std": 1.0, "grad_norm": 1.7484197616577148, "kl": 3.3789178878068924, "learning_rate": 4.396338446313707e-06, "loss": 0.12866923213005066, "num_tokens": 4181444.0, "reward": -0.07500000484287739, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": -0.07500000484287739, "rewards/reward_financial_reasoning/std": 0.18708287924528122, "step": 470, "step_time": 22.79538112249793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 17.9375, "completions/mean_terminated_length": 17.9375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.21015138023152272, "frac_reward_zero_std": 1.0, "grad_norm": 1.1577832698822021, "kl": 2.395612269639969, "learning_rate": 4.391390400791688e-06, "loss": 0.09630811214447021, "num_tokens": 4198235.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 472, "step_time": 14.938532446500176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.25, "completions/clipped_ratio": 0.0, "completions/max_length": 44.5, "completions/max_terminated_length": 44.5, "completions/mean_length": 27.25, "completions/mean_terminated_length": 27.25, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.21104185218165628, "frac_reward_zero_std": 1.0, "grad_norm": 0.05807355418801308, "kl": 3.640577018260956, "learning_rate": 4.386442355269669e-06, "loss": 0.1443350613117218, "num_tokens": 4211927.0, "reward": -0.32500000298023224, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": -0.32500000298023224, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 474, "step_time": 19.14156578449547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.5, "completions/max_terminated_length": 33.5, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.21193232413178986, "frac_reward_zero_std": 0.75, "grad_norm": 6.008650779724121, "kl": 2.670556515455246, "learning_rate": 4.38149430974765e-06, "loss": 0.10395485162734985, "num_tokens": 4231151.0, "reward": -0.07500000298023224, "reward_std": 0.23106741905212402, "rewards/reward_financial_reasoning/mean": -0.07500000298023224, "rewards/reward_financial_reasoning/std": 0.23106742650270462, "step": 476, "step_time": 20.392167506000987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.125, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.125, "completions/mean_terminated_length": 10.125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.21282279608192342, "frac_reward_zero_std": 1.0, "grad_norm": 16.985912322998047, "kl": 7.489349842071533, "learning_rate": 4.376546264225631e-06, "loss": 0.2728106379508972, "num_tokens": 4245401.0, "reward": 0.12500000558793545, "reward_std": 0.34743960946798325, "rewards/reward_financial_reasoning/mean": 0.12500000558793545, "rewards/reward_financial_reasoning/std": 0.34743960946798325, "step": 478, "step_time": 11.78860704699764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.125, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 21.125, "completions/mean_terminated_length": 21.125, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.21371326803205698, "frac_reward_zero_std": 1.0, "grad_norm": 0.06662454456090927, "kl": 3.3793341517448425, "learning_rate": 4.371598218703612e-06, "loss": 0.12479593604803085, "num_tokens": 4265107.0, "reward": 0.17500000447034836, "reward_std": 0.29398737102746964, "rewards/reward_financial_reasoning/mean": 0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.2939873933792114, "step": 480, "step_time": 21.45671570050399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 37.5, "completions/max_terminated_length": 37.5, "completions/mean_length": 19.1875, "completions/mean_terminated_length": 19.1875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.21460373998219057, "frac_reward_zero_std": 1.0, "grad_norm": 0.21638722717761993, "kl": 4.512425392866135, "learning_rate": 4.366650173181593e-06, "loss": 0.12742410600185394, "num_tokens": 4284854.0, "reward": 0.22499999776482582, "reward_std": 0.34743958711624146, "rewards/reward_financial_reasoning/mean": 0.22499999776482582, "rewards/reward_financial_reasoning/std": 0.34743958711624146, "step": 482, "step_time": 21.60816939650249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 53.3125, "completions/clipped_ratio": 0.125, "completions/max_length": 149.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.3125, "completions/mean_terminated_length": 24.479166984558105, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.21549421193232413, "frac_reward_zero_std": 0.75, "grad_norm": 15.088199615478516, "kl": 2.1596968695521355, "learning_rate": 4.361702127659575e-06, "loss": 0.06029912829399109, "num_tokens": 4299651.0, "reward": 0.01250000111758709, "reward_std": 0.24438642710447311, "rewards/reward_financial_reasoning/mean": 0.01250000111758709, "rewards/reward_financial_reasoning/std": 0.2443864420056343, "step": 484, "step_time": 49.9494952740024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 31.5, "completions/max_terminated_length": 31.5, "completions/mean_length": 25.1875, "completions/mean_terminated_length": 25.1875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2163846838824577, "frac_reward_zero_std": 1.0, "grad_norm": 1.7525089979171753, "kl": 2.5929179787635803, "learning_rate": 4.356754082137556e-06, "loss": 0.10190180689096451, "num_tokens": 4317406.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 486, "step_time": 18.708130840001104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 12.9375, "completions/mean_terminated_length": 12.9375, "completions/min_length": 4.5, "completions/min_terminated_length": 4.5, "epoch": 0.21727515583259127, "frac_reward_zero_std": 1.0, "grad_norm": 0.8261609673500061, "kl": 3.737182319164276, "learning_rate": 4.351806036615537e-06, "loss": 0.12806250154972076, "num_tokens": 4339637.0, "reward": -0.20000000298023224, "reward_std": 0.32071349024772644, "rewards/reward_financial_reasoning/mean": -0.20000000298023224, "rewards/reward_financial_reasoning/std": 0.32071349024772644, "step": 488, "step_time": 19.100060859494988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 28.5, "completions/max_terminated_length": 28.5, "completions/mean_length": 22.5625, "completions/mean_terminated_length": 22.5625, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.21816562778272486, "frac_reward_zero_std": 1.0, "grad_norm": 0.06733864545822144, "kl": 3.426992893218994, "learning_rate": 4.346857991093518e-06, "loss": 0.1387546956539154, "num_tokens": 4354870.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 490, "step_time": 16.014215874009096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 18.4375, "completions/mean_terminated_length": 18.4375, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.21905609973285842, "frac_reward_zero_std": 1.0, "grad_norm": 0.10745098441839218, "kl": 12.205712288618088, "learning_rate": 4.3419099455715e-06, "loss": 0.3105950951576233, "num_tokens": 4368573.0, "reward": -0.12500000186264515, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": -0.12500000186264515, "rewards/reward_financial_reasoning/std": 0.24053511768579483, "step": 492, "step_time": 13.937267904999317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 149.5, "completions/max_terminated_length": 29.5, "completions/mean_length": 35.0, "completions/mean_terminated_length": 19.785714626312256, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.21994657168299198, "frac_reward_zero_std": 1.0, "grad_norm": 0.07543536275625229, "kl": 2.3355341032147408, "learning_rate": 4.336961900049481e-06, "loss": 0.0785098671913147, "num_tokens": 4393805.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 494, "step_time": 57.094905161506176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 15.6875, "completions/mean_terminated_length": 15.6875, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.22083704363312556, "frac_reward_zero_std": 1.0, "grad_norm": 0.15516425669193268, "kl": 2.783940374851227, "learning_rate": 4.332013854527462e-06, "loss": 0.10796858370304108, "num_tokens": 4410792.0, "reward": -0.20000000298023224, "reward_std": 0.32071349024772644, "rewards/reward_financial_reasoning/mean": -0.20000000298023224, "rewards/reward_financial_reasoning/std": 0.32071349024772644, "step": 496, "step_time": 17.537851961504202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.5, "completions/max_terminated_length": 33.5, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.22172751558325912, "frac_reward_zero_std": 0.75, "grad_norm": 4.727818965911865, "kl": 3.4606142938137054, "learning_rate": 4.327065809005443e-06, "loss": 0.21134862303733826, "num_tokens": 4424176.0, "reward": 0.16250000894069672, "reward_std": 0.24571321159601212, "rewards/reward_financial_reasoning/mean": 0.16250000894069672, "rewards/reward_financial_reasoning/std": 0.2457132264971733, "step": 498, "step_time": 17.342474356501043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.4375, "completions/clipped_ratio": 0.0625, "completions/max_length": 131.5, "completions/max_terminated_length": 12.5, "completions/mean_length": 25.4375, "completions/mean_terminated_length": 10.285714626312256, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.2226179875333927, "frac_reward_zero_std": 0.75, "grad_norm": 1.4800151586532593, "kl": 5.777998372912407, "learning_rate": 4.322117763483424e-06, "loss": 0.3806329369544983, "num_tokens": 4437359.0, "reward": -0.012499988079071045, "reward_std": 0.28327932208776474, "rewards/reward_financial_reasoning/mean": -0.012499988079071045, "rewards/reward_financial_reasoning/std": 0.2832793518900871, "step": 500, "step_time": 42.75106621949817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.4375, "completions/clipped_ratio": 0.0625, "completions/max_length": 145.5, "completions/max_terminated_length": 39.0, "completions/mean_length": 42.4375, "completions/mean_terminated_length": 28.46428680419922, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.22350845948352627, "frac_reward_zero_std": 0.5, "grad_norm": 3.514166831970215, "kl": 2.001025475561619, "learning_rate": 4.317169717961406e-06, "loss": 0.12764599919319153, "num_tokens": 4455462.0, "reward": 0.21249999850988388, "reward_std": 0.3181980475783348, "rewards/reward_financial_reasoning/mean": 0.21249999850988388, "rewards/reward_financial_reasoning/std": 0.3181980773806572, "step": 502, "step_time": 52.044455729494075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.5, "completions/clipped_ratio": 0.0, "completions/max_length": 11.5, "completions/max_terminated_length": 11.5, "completions/mean_length": 9.5, "completions/mean_terminated_length": 9.5, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.22439893143365983, "frac_reward_zero_std": 1.0, "grad_norm": 3.35060715675354, "kl": 2.7600596249103546, "learning_rate": 4.312221672439386e-06, "loss": 0.1091075986623764, "num_tokens": 4479406.0, "reward": -0.20000000298023224, "reward_std": 0.32071349024772644, "rewards/reward_financial_reasoning/mean": -0.20000000298023224, "rewards/reward_financial_reasoning/std": 0.32071349024772644, "step": 504, "step_time": 17.23378602399316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 58.5, "completions/max_terminated_length": 58.5, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2252894033837934, "frac_reward_zero_std": 1.0, "grad_norm": 0.17364293336868286, "kl": 2.8194206953048706, "learning_rate": 4.307273626917368e-06, "loss": 0.10312893986701965, "num_tokens": 4497138.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 506, "step_time": 25.97717865550294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.875, "completions/clipped_ratio": 0.0625, "completions/max_length": 142.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 32.875, "completions/mean_terminated_length": 17.535714626312256, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.22617987533392697, "frac_reward_zero_std": 0.75, "grad_norm": 3.222346782684326, "kl": 3.0017409920692444, "learning_rate": 4.302325581395349e-06, "loss": 0.2027941793203354, "num_tokens": 4521648.0, "reward": 0.11250000447034836, "reward_std": 0.31000544875860214, "rewards/reward_financial_reasoning/mean": 0.11250000447034836, "rewards/reward_financial_reasoning/std": 0.3100054860115051, "step": 508, "step_time": 53.93680636600038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 28.5, "completions/max_terminated_length": 28.5, "completions/mean_length": 17.8125, "completions/mean_terminated_length": 17.8125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.22707034728406056, "frac_reward_zero_std": 1.0, "grad_norm": 2410.09130859375, "kl": 113.49399599432945, "learning_rate": 4.297377535873331e-06, "loss": 3.160386323928833, "num_tokens": 4534861.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 510, "step_time": 14.666269088997069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 27.5, "completions/max_terminated_length": 27.5, "completions/mean_length": 21.6875, "completions/mean_terminated_length": 21.6875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.22796081923419412, "frac_reward_zero_std": 1.0, "grad_norm": 2.0670084953308105, "kl": 3.2094925343990326, "learning_rate": 4.292429490351312e-06, "loss": 0.1282799392938614, "num_tokens": 4552176.0, "reward": -0.04999999701976776, "reward_std": 0.37416571378707886, "rewards/reward_financial_reasoning/mean": -0.04999999701976776, "rewards/reward_financial_reasoning/std": 0.37416571378707886, "step": 512, "step_time": 17.098706054504873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.25, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 14.25, "completions/mean_terminated_length": 14.25, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.2288512911843277, "frac_reward_zero_std": 1.0, "grad_norm": 0.08085216581821442, "kl": 2.5412717759609222, "learning_rate": 4.287481444829293e-06, "loss": 0.10157681256532669, "num_tokens": 4559796.0, "reward": 0.5750000029802322, "reward_std": 0.34743958711624146, "rewards/reward_financial_reasoning/mean": 0.5750000029802322, "rewards/reward_financial_reasoning/std": 0.34743958711624146, "step": 514, "step_time": 10.108604683504382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 15.5, "completions/max_terminated_length": 15.5, "completions/mean_length": 11.1875, "completions/mean_terminated_length": 11.1875, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.22974176313446126, "frac_reward_zero_std": 1.0, "grad_norm": 0.29972851276397705, "kl": 2.5984874963760376, "learning_rate": 4.282533399307274e-06, "loss": 0.10182797163724899, "num_tokens": 4579279.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 516, "step_time": 15.588283419001527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.375, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 20.375, "completions/mean_terminated_length": 20.375, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.23063223508459482, "frac_reward_zero_std": 1.0, "grad_norm": 0.12788325548171997, "kl": 2.4726991653442383, "learning_rate": 4.2775853537852555e-06, "loss": 0.09916701912879944, "num_tokens": 4596741.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 518, "step_time": 15.570501908492588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.8125, "completions/mean_terminated_length": 10.8125, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.2315227070347284, "frac_reward_zero_std": 1.0, "grad_norm": 3.225735902786255, "kl": 4.514407992362976, "learning_rate": 4.2726373082632364e-06, "loss": 0.17293544113636017, "num_tokens": 4616618.0, "reward": -0.19999999925494194, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": -0.19999999925494194, "rewards/reward_financial_reasoning/std": 0.0, "step": 520, "step_time": 15.201164821501152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.375, "completions/clipped_ratio": 0.0625, "completions/max_length": 136.0, "completions/max_terminated_length": 21.5, "completions/mean_length": 29.375, "completions/mean_terminated_length": 14.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.23241317898486197, "frac_reward_zero_std": 0.75, "grad_norm": 0.38097259402275085, "kl": 2.0567711293697357, "learning_rate": 4.267689262741217e-06, "loss": 0.21855676174163818, "num_tokens": 4637952.0, "reward": 0.07500001043081284, "reward_std": 0.3823606073856354, "rewards/reward_financial_reasoning/mean": 0.07500001043081284, "rewards/reward_financial_reasoning/std": 0.3823606073856354, "step": 522, "step_time": 51.707422207495256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.1875, "completions/clipped_ratio": 0.0625, "completions/max_length": 143.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 28.1875, "completions/mean_terminated_length": 12.964285850524902, "completions/min_length": 4.5, "completions/min_terminated_length": 4.5, "epoch": 0.23330365093499555, "frac_reward_zero_std": 0.75, "grad_norm": 2.2591474056243896, "kl": 2.7854665964841843, "learning_rate": 4.262741217219199e-06, "loss": 0.2408628761768341, "num_tokens": 4655307.0, "reward": 0.0625000074505806, "reward_std": 0.36345769464969635, "rewards/reward_financial_reasoning/mean": 0.0625000074505806, "rewards/reward_financial_reasoning/std": 0.36345772445201874, "step": 524, "step_time": 51.00816127199505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.125, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 14.125, "completions/mean_terminated_length": 14.125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2341941228851291, "frac_reward_zero_std": 1.0, "grad_norm": 0.11695566028356552, "kl": 2.5478688031435013, "learning_rate": 4.25779317169718e-06, "loss": 0.09943416714668274, "num_tokens": 4669117.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 526, "step_time": 11.958828082002583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 23.5, "completions/max_terminated_length": 23.5, "completions/mean_length": 16.5625, "completions/mean_terminated_length": 16.5625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2350845948352627, "frac_reward_zero_std": 0.75, "grad_norm": 0.1739898920059204, "kl": 2.163258582353592, "learning_rate": 4.252845126175161e-06, "loss": 0.07099819928407669, "num_tokens": 4691342.0, "reward": 0.012499995529651642, "reward_std": 0.46944527328014374, "rewards/reward_financial_reasoning/mean": 0.012499995529651642, "rewards/reward_financial_reasoning/std": 0.46944527328014374, "step": 528, "step_time": 19.66749314849585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.23597506678539626, "frac_reward_zero_std": 1.0, "grad_norm": 0.1463579386472702, "kl": 3.313349887728691, "learning_rate": 4.247897080653142e-06, "loss": 0.11747953295707703, "num_tokens": 4713458.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 530, "step_time": 19.892412299501302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.625, "completions/clipped_ratio": 0.0, "completions/max_length": 36.5, "completions/max_terminated_length": 36.5, "completions/mean_length": 22.625, "completions/mean_terminated_length": 22.625, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.23686553873552982, "frac_reward_zero_std": 1.0, "grad_norm": 0.16379092633724213, "kl": 2.420012056827545, "learning_rate": 4.242949035131124e-06, "loss": 0.0907297134399414, "num_tokens": 4735012.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 532, "step_time": 22.5668212334931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.1875, "completions/clipped_ratio": 0.0625, "completions/max_length": 136.0, "completions/max_terminated_length": 33.5, "completions/mean_length": 33.1875, "completions/mean_terminated_length": 18.85714340209961, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.2377560106856634, "frac_reward_zero_std": 1.0, "grad_norm": 0.08127009868621826, "kl": 2.7376275807619095, "learning_rate": 4.238000989609105e-06, "loss": 0.0901961699128151, "num_tokens": 4754439.0, "reward": -0.12500000186264515, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": -0.12500000186264515, "rewards/reward_financial_reasoning/std": 0.24053511768579483, "step": 534, "step_time": 49.09290904900263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 120.5, "completions/clipped_ratio": 0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 18.5, "completions/mean_length": 120.5, "completions/mean_terminated_length": 15.300000190734863, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.23864648263579696, "frac_reward_zero_std": 1.0, "grad_norm": 0.09383174031972885, "kl": 1.5632264073938131, "learning_rate": 4.233052944087086e-06, "loss": 0.0377073734998703, "num_tokens": 4783719.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 536, "step_time": 91.08600207899872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.75, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 11.75, "completions/mean_terminated_length": 11.75, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.23953695458593055, "frac_reward_zero_std": 1.0, "grad_norm": 0.38002869486808777, "kl": 3.1995151340961456, "learning_rate": 4.228104898565067e-06, "loss": 0.12680239975452423, "num_tokens": 4806131.0, "reward": -0.04999999701976776, "reward_std": 0.37416571378707886, "rewards/reward_financial_reasoning/mean": -0.04999999701976776, "rewards/reward_financial_reasoning/std": 0.37416571378707886, "step": 538, "step_time": 18.143814373001078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 20.4375, "completions/mean_terminated_length": 20.4375, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.2404274265360641, "frac_reward_zero_std": 1.0, "grad_norm": 0.13875405490398407, "kl": 2.7571463584899902, "learning_rate": 4.2231568530430485e-06, "loss": 0.11056126654148102, "num_tokens": 4820570.0, "reward": 0.17500000447034836, "reward_std": 0.29398737102746964, "rewards/reward_financial_reasoning/mean": 0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.2939873933792114, "step": 540, "step_time": 15.814246591500705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 22.25, "completions/mean_terminated_length": 22.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2413178984861977, "frac_reward_zero_std": 1.0, "grad_norm": 0.36984992027282715, "kl": 1.9395490437746048, "learning_rate": 4.2182088075210294e-06, "loss": 0.08099240064620972, "num_tokens": 4834806.0, "reward": 0.17500000074505806, "reward_std": 0.29398736357688904, "rewards/reward_financial_reasoning/mean": 0.17500000074505806, "rewards/reward_financial_reasoning/std": 0.29398736357688904, "step": 542, "step_time": 16.965279011004895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.625, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 13.625, "completions/mean_terminated_length": 13.625, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.24220837043633126, "frac_reward_zero_std": 1.0, "grad_norm": 0.26365983486175537, "kl": 3.2727459371089935, "learning_rate": 4.21326076199901e-06, "loss": 0.13695213198661804, "num_tokens": 4851616.0, "reward": 0.025000005960464478, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": 0.025000005960464478, "rewards/reward_financial_reasoning/std": 0.24053513258695602, "step": 544, "step_time": 14.740081926498533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.9375, "completions/mean_terminated_length": 11.9375, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.24309884238646481, "frac_reward_zero_std": 1.0, "grad_norm": 0.7430917620658875, "kl": 5.095589101314545, "learning_rate": 4.208312716476992e-06, "loss": 0.20536673069000244, "num_tokens": 4876447.0, "reward": -0.1999999973922968, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.1999999973922968, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 546, "step_time": 19.51627654249387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.625, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2439893143365984, "frac_reward_zero_std": 1.0, "grad_norm": 0.12649628520011902, "kl": 2.3244690746068954, "learning_rate": 4.203364670954973e-06, "loss": 0.0855722650885582, "num_tokens": 4897945.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 548, "step_time": 22.503613662498537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 14.5, "completions/mean_length": 42.25, "completions/mean_terminated_length": 11.71428632736206, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.24487978628673196, "frac_reward_zero_std": 0.5, "grad_norm": 8.322593688964844, "kl": 3.0940473526716232, "learning_rate": 4.198416625432954e-06, "loss": 0.2162281721830368, "num_tokens": 4921997.0, "reward": 0.17500000912696123, "reward_std": 0.30916696786880493, "rewards/reward_financial_reasoning/mean": 0.17500000912696123, "rewards/reward_financial_reasoning/std": 0.3091669976711273, "step": 550, "step_time": 87.72647281950412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0625, "completions/max_length": 169.5, "completions/max_terminated_length": 51.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 20.767857551574707, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.24577025823686555, "frac_reward_zero_std": 0.75, "grad_norm": 0.2092423439025879, "kl": 2.4467161744832993, "learning_rate": 4.193468579910935e-06, "loss": 0.209614560008049, "num_tokens": 4944489.0, "reward": 0.21250000968575478, "reward_std": 0.20310094952583313, "rewards/reward_financial_reasoning/mean": 0.21250000968575478, "rewards/reward_financial_reasoning/std": 0.20310097932815552, "step": 552, "step_time": 61.421522411004844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2466607301869991, "frac_reward_zero_std": 1.0, "grad_norm": 0.2507273256778717, "kl": 2.4721511006355286, "learning_rate": 4.188520534388917e-06, "loss": 0.09320840239524841, "num_tokens": 4962209.0, "reward": 0.30000001192092896, "reward_std": 0.5345224589109421, "rewards/reward_financial_reasoning/mean": 0.30000001192092896, "rewards/reward_financial_reasoning/std": 0.5345224738121033, "step": 554, "step_time": 15.359706958995957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.3125, "completions/clipped_ratio": 0.0625, "completions/max_length": 142.0, "completions/max_terminated_length": 22.5, "completions/mean_length": 31.3125, "completions/mean_terminated_length": 16.053571701049805, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2475512021371327, "frac_reward_zero_std": 0.75, "grad_norm": 4.212193012237549, "kl": 3.4973824322223663, "learning_rate": 4.183572488866898e-06, "loss": 0.16043438017368317, "num_tokens": 4975462.0, "reward": 0.4374999888241291, "reward_std": 0.20310094952583313, "rewards/reward_financial_reasoning/mean": 0.4374999888241291, "rewards/reward_financial_reasoning/std": 0.20310096442699432, "step": 556, "step_time": 45.51254690750284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.1875, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 15.5, "completions/mean_length": 41.1875, "completions/mean_terminated_length": 10.500000476837158, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.24844167408726625, "frac_reward_zero_std": 0.75, "grad_norm": 0.3985747694969177, "kl": 3.6526686996221542, "learning_rate": 4.1786244433448796e-06, "loss": 0.23863635957241058, "num_tokens": 4997065.0, "reward": -0.17499999701976776, "reward_std": 0.32802625745534897, "rewards/reward_financial_reasoning/mean": -0.17499999701976776, "rewards/reward_financial_reasoning/std": 0.32802625745534897, "step": 558, "step_time": 84.31422862350155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 23.0, "completions/mean_terminated_length": 23.0, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.2493321460373998, "frac_reward_zero_std": 1.0, "grad_norm": 0.17756827175617218, "kl": 2.134700432419777, "learning_rate": 4.17367639782286e-06, "loss": 0.08119706809520721, "num_tokens": 5019425.0, "reward": 0.1250000037252903, "reward_std": 0.34743958711624146, "rewards/reward_financial_reasoning/mean": 0.1250000037252903, "rewards/reward_financial_reasoning/std": 0.34743958711624146, "step": 560, "step_time": 21.187141643997165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.875, "completions/clipped_ratio": 0.1875, "completions/max_length": 147.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 64.875, "completions/mean_terminated_length": 19.050000190734863, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.25022261798753337, "frac_reward_zero_std": 1.0, "grad_norm": 0.06384090334177017, "kl": 33.90071538090706, "learning_rate": 4.1687283523008415e-06, "loss": 0.38022756576538086, "num_tokens": 5042127.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 562, "step_time": 54.73333459249625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 56.875, "completions/clipped_ratio": 0.125, "completions/max_length": 154.5, "completions/max_terminated_length": 32.0, "completions/mean_length": 56.875, "completions/mean_terminated_length": 26.166666984558105, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.25111308993766696, "frac_reward_zero_std": 0.75, "grad_norm": 0.06289020925760269, "kl": 2.710405856370926, "learning_rate": 4.163780306778823e-06, "loss": 0.245255246758461, "num_tokens": 5058741.0, "reward": 0.125, "reward_std": 0.2314550280570984, "rewards/reward_financial_reasoning/mean": 0.125, "rewards/reward_financial_reasoning/std": 0.2314550280570984, "step": 564, "step_time": 51.34954524199202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.75, "completions/clipped_ratio": 0.0625, "completions/max_length": 144.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 30.75, "completions/mean_terminated_length": 15.437500476837158, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.25200356188780054, "frac_reward_zero_std": 0.75, "grad_norm": 4.2839765548706055, "kl": 3.1892134696245193, "learning_rate": 4.158832261256804e-06, "loss": 0.1574995219707489, "num_tokens": 5073553.0, "reward": 0.6375000327825546, "reward_std": 0.22243820875883102, "rewards/reward_financial_reasoning/mean": 0.6375000327825546, "rewards/reward_financial_reasoning/std": 0.22243822365999222, "step": 566, "step_time": 46.588499311001215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 71.375, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 71.375, "completions/mean_terminated_length": 29.416667938232422, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.25289403383793413, "frac_reward_zero_std": 0.75, "grad_norm": 3.5852010250091553, "kl": 1.5079333148896694, "learning_rate": 4.153884215734785e-06, "loss": -0.010437268763780594, "num_tokens": 5094175.0, "reward": -0.16250000149011612, "reward_std": 0.3156214952468872, "rewards/reward_financial_reasoning/mean": -0.16250000149011612, "rewards/reward_financial_reasoning/std": 0.3156214952468872, "step": 568, "step_time": 82.4080735499956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 36.5, "completions/max_terminated_length": 36.5, "completions/mean_length": 16.9375, "completions/mean_terminated_length": 16.9375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.25378450578806766, "frac_reward_zero_std": 0.75, "grad_norm": 7.807444095611572, "kl": 3.8660908937454224, "learning_rate": 4.148936170212766e-06, "loss": 0.19987696409225464, "num_tokens": 5113566.0, "reward": -0.012499988079071045, "reward_std": 0.28327932208776474, "rewards/reward_financial_reasoning/mean": -0.012499988079071045, "rewards/reward_financial_reasoning/std": 0.2832793518900871, "step": 570, "step_time": 21.036308422495495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.5, "completions/clipped_ratio": 0.0625, "completions/max_length": 141.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 26.5, "completions/mean_terminated_length": 11.017857551574707, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.25467497773820125, "frac_reward_zero_std": 1.0, "grad_norm": 0.8640673756599426, "kl": 4.032097265124321, "learning_rate": 4.143988124690748e-06, "loss": 0.14955981075763702, "num_tokens": 5130134.0, "reward": 0.05000000447034836, "reward_std": 0.42761798202991486, "rewards/reward_financial_reasoning/mean": 0.05000000447034836, "rewards/reward_financial_reasoning/std": 0.42761798202991486, "step": 572, "step_time": 47.026896394996584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.25, "completions/clipped_ratio": 0.0, "completions/max_length": 97.5, "completions/max_terminated_length": 97.5, "completions/mean_length": 32.25, "completions/mean_terminated_length": 32.25, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.25556544968833483, "frac_reward_zero_std": 1.0, "grad_norm": 0.19692854583263397, "kl": 2.4768466502428055, "learning_rate": 4.139040079168729e-06, "loss": 0.07914983481168747, "num_tokens": 5147642.0, "reward": 0.025000005960464478, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": 0.025000005960464478, "rewards/reward_financial_reasoning/std": 0.24053513258695602, "step": 574, "step_time": 36.08788107399232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.125, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 30.125, "completions/mean_terminated_length": 30.125, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.25645592163846836, "frac_reward_zero_std": 1.0, "grad_norm": 1.0939751863479614, "kl": 2.487314283847809, "learning_rate": 4.13409203364671e-06, "loss": 0.09564441442489624, "num_tokens": 5163948.0, "reward": 0.17500000074505806, "reward_std": 0.29398736357688904, "rewards/reward_financial_reasoning/mean": 0.17500000074505806, "rewards/reward_financial_reasoning/std": 0.29398736357688904, "step": 576, "step_time": 20.96104118149742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 9.6875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.25734639358860195, "frac_reward_zero_std": 1.0, "grad_norm": 1.4339823722839355, "kl": 4.345266610383987, "learning_rate": 4.129143988124691e-06, "loss": 0.15623024106025696, "num_tokens": 5178567.0, "reward": 0.5249999910593033, "reward_std": 0.40089183300733566, "rewards/reward_financial_reasoning/mean": 0.5249999910593033, "rewards/reward_financial_reasoning/std": 0.40089183300733566, "step": 578, "step_time": 13.346558070505125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.875, "completions/clipped_ratio": 0.0625, "completions/max_length": 153.0, "completions/max_terminated_length": 48.5, "completions/mean_length": 43.875, "completions/mean_terminated_length": 29.750000953674316, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.25823686553873554, "frac_reward_zero_std": 1.0, "grad_norm": 0.06241405010223389, "kl": 2.072991468012333, "learning_rate": 4.1241959426026726e-06, "loss": 0.07811924070119858, "num_tokens": 5197021.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 580, "step_time": 50.97594972850857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.5, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 16.57142925262451, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2591273374888691, "frac_reward_zero_std": 0.5, "grad_norm": 3.204791784286499, "kl": 3.2653029412031174, "learning_rate": 4.1192478970806535e-06, "loss": 0.18918476998806, "num_tokens": 5216461.0, "reward": 0.22500000894069672, "reward_std": 0.4830881953239441, "rewards/reward_financial_reasoning/mean": 0.22500000894069672, "rewards/reward_financial_reasoning/std": 0.4830882251262665, "step": 582, "step_time": 83.53283959699183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 31.5, "completions/max_terminated_length": 31.5, "completions/mean_length": 22.6875, "completions/mean_terminated_length": 22.6875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.26001780943900266, "frac_reward_zero_std": 0.75, "grad_norm": 0.1705213189125061, "kl": 2.3984710425138474, "learning_rate": 4.1142998515586345e-06, "loss": 0.10159868746995926, "num_tokens": 5237608.0, "reward": -0.03750000521540642, "reward_std": 0.1505940556526184, "rewards/reward_financial_reasoning/mean": -0.03750000521540642, "rewards/reward_financial_reasoning/std": 0.1505940705537796, "step": 584, "step_time": 20.515003172498837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 68.6875, "completions/clipped_ratio": 0.25, "completions/max_length": 134.0, "completions/max_terminated_length": 8.5, "completions/mean_length": 68.6875, "completions/mean_terminated_length": 5.9375, "completions/min_length": 4.5, "completions/min_terminated_length": 4.5, "epoch": 0.26090828138913624, "frac_reward_zero_std": 1.0, "grad_norm": 1.3467575311660767, "kl": 4.0492381900548935, "learning_rate": 4.109351806036616e-06, "loss": 0.15455511212348938, "num_tokens": 5252235.0, "reward": 0.22500000894069672, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.22500000894069672, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 586, "step_time": 42.47935264099942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 14.4375, "completions/mean_terminated_length": 14.4375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.26179875333926983, "frac_reward_zero_std": 1.0, "grad_norm": 0.4897940158843994, "kl": 2.533974751830101, "learning_rate": 4.104403760514597e-06, "loss": 0.10408926755189896, "num_tokens": 5272538.0, "reward": 0.17500000447034836, "reward_std": 0.29398737102746964, "rewards/reward_financial_reasoning/mean": 0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.2939873933792114, "step": 588, "step_time": 16.557259472498117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 74.625, "completions/clipped_ratio": 0.25, "completions/max_length": 141.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 74.625, "completions/mean_terminated_length": 11.125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.26268922528940336, "frac_reward_zero_std": 0.75, "grad_norm": 0.31131821870803833, "kl": 4.341089241206646, "learning_rate": 4.099455714992578e-06, "loss": 0.05962072312831879, "num_tokens": 5287316.0, "reward": 0.4374999888241291, "reward_std": 0.20310094952583313, "rewards/reward_financial_reasoning/mean": 0.4374999888241291, "rewards/reward_financial_reasoning/std": 0.20310096442699432, "step": 590, "step_time": 44.81954843049607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 77.6875, "completions/clipped_ratio": 0.25, "completions/max_length": 146.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 77.6875, "completions/mean_terminated_length": 15.1875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.26357969723953695, "frac_reward_zero_std": 1.0, "grad_norm": 0.08828865736722946, "kl": 2.6844928599894047, "learning_rate": 4.094507669470559e-06, "loss": 0.08801240473985672, "num_tokens": 5299063.0, "reward": 0.3500000014901161, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": 0.3500000014901161, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 592, "step_time": 44.574043756492756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.375, "completions/clipped_ratio": 0.125, "completions/max_length": 139.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 46.375, "completions/mean_terminated_length": 16.58333396911621, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.26447016918967053, "frac_reward_zero_std": 0.75, "grad_norm": 0.9881147146224976, "kl": 2.950136587023735, "learning_rate": 4.089559623948541e-06, "loss": 0.26590752601623535, "num_tokens": 5319749.0, "reward": 0.15000000223517418, "reward_std": 0.1963960975408554, "rewards/reward_financial_reasoning/mean": 0.15000000223517418, "rewards/reward_financial_reasoning/std": 0.1963961124420166, "step": 594, "step_time": 49.72403268000926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.2653606411398041, "frac_reward_zero_std": 0.75, "grad_norm": 5.235046863555908, "kl": 2.279493637382984, "learning_rate": 4.084611578426522e-06, "loss": 0.08282871544361115, "num_tokens": 5335481.0, "reward": 0.1250000037252903, "reward_std": 0.5438356846570969, "rewards/reward_financial_reasoning/mean": 0.1250000037252903, "rewards/reward_financial_reasoning/std": 0.5438356846570969, "step": 596, "step_time": 14.50181246100692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.375, "completions/clipped_ratio": 0.0625, "completions/max_length": 135.0, "completions/max_terminated_length": 30.5, "completions/mean_length": 37.375, "completions/mean_terminated_length": 23.491071701049805, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.26625111308993765, "frac_reward_zero_std": 0.75, "grad_norm": 0.7327852249145508, "kl": 2.17822552472353, "learning_rate": 4.079663532904503e-06, "loss": 0.14887002110481262, "num_tokens": 5351039.0, "reward": 0.0625000074505806, "reward_std": 0.36345769464969635, "rewards/reward_financial_reasoning/mean": 0.0625000074505806, "rewards/reward_financial_reasoning/std": 0.36345772445201874, "step": 598, "step_time": 43.72957969250274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.625, "completions/clipped_ratio": 0.0, "completions/max_length": 35.5, "completions/max_terminated_length": 35.5, "completions/mean_length": 24.625, "completions/mean_terminated_length": 24.625, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.26714158504007124, "frac_reward_zero_std": 1.0, "grad_norm": 0.892500102519989, "kl": 2.6852394491434097, "learning_rate": 4.074715487382484e-06, "loss": 0.10773130506277084, "num_tokens": 5371433.0, "reward": 0.07499999552965164, "reward_std": 0.40089186280965805, "rewards/reward_financial_reasoning/mean": 0.07499999552965164, "rewards/reward_financial_reasoning/std": 0.40089187026023865, "step": 600, "step_time": 21.074333743999887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 15.1875, "completions/mean_terminated_length": 15.1875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.2680320569902048, "frac_reward_zero_std": 1.0, "grad_norm": 0.12110089510679245, "kl": 5.3096261620521545, "learning_rate": 4.0697674418604655e-06, "loss": 0.17482957243919373, "num_tokens": 5382116.0, "reward": 0.424999987706542, "reward_std": 0.13363061845302582, "rewards/reward_financial_reasoning/mean": 0.424999987706542, "rewards/reward_financial_reasoning/std": 0.13363061845302582, "step": 602, "step_time": 15.877465633999236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.26892252894033836, "frac_reward_zero_std": 1.0, "grad_norm": 0.11750569939613342, "kl": 2.0825249701738358, "learning_rate": 4.0648193963384465e-06, "loss": 0.0821978822350502, "num_tokens": 5400944.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 604, "step_time": 16.235258598000655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.26981300089047194, "frac_reward_zero_std": 1.0, "grad_norm": 0.39543071389198303, "kl": 2.9878444522619247, "learning_rate": 4.0598713508164274e-06, "loss": 0.12423422187566757, "num_tokens": 5417336.0, "reward": 0.17500000447034836, "reward_std": 0.29398737102746964, "rewards/reward_financial_reasoning/mean": 0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.2939873933792114, "step": 606, "step_time": 14.500998946492473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.625, "completions/clipped_ratio": 0.125, "completions/max_length": 138.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 45.625, "completions/mean_terminated_length": 15.708333969116211, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.27070347284060553, "frac_reward_zero_std": 1.0, "grad_norm": 0.09268730878829956, "kl": 3.4857796132564545, "learning_rate": 4.054923305294409e-06, "loss": 0.0689154639840126, "num_tokens": 5432242.0, "reward": 0.4999999888241291, "reward_std": 0.05345224589109421, "rewards/reward_financial_reasoning/mean": 0.4999999888241291, "rewards/reward_financial_reasoning/std": 0.05345224589109421, "step": 608, "step_time": 44.237581209497876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.1875, "completions/clipped_ratio": 0.0625, "completions/max_length": 158.5, "completions/max_terminated_length": 42.5, "completions/mean_length": 41.1875, "completions/mean_terminated_length": 26.25892925262451, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.2715939447907391, "frac_reward_zero_std": 1.0, "grad_norm": 0.2548654079437256, "kl": 1.9635280668735504, "learning_rate": 4.04997525977239e-06, "loss": 0.0742587149143219, "num_tokens": 5449301.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 610, "step_time": 53.4047240209984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 15.3125, "completions/mean_terminated_length": 15.3125, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.27248441674087265, "frac_reward_zero_std": 0.75, "grad_norm": 0.25754010677337646, "kl": 2.283314034342766, "learning_rate": 4.045027214250372e-06, "loss": 0.10037024319171906, "num_tokens": 5471570.0, "reward": 0.2500000074505806, "reward_std": 0.2577935457229614, "rewards/reward_financial_reasoning/mean": 0.2500000074505806, "rewards/reward_financial_reasoning/std": 0.2577935680747032, "step": 612, "step_time": 17.858794250994833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 28.0625, "completions/mean_terminated_length": 28.0625, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.27337488869100623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0666544958949089, "kl": 2.3905524760484695, "learning_rate": 4.040079168728352e-06, "loss": 0.09394139051437378, "num_tokens": 5489603.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 614, "step_time": 23.732076187996427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 35.5, "completions/max_terminated_length": 35.5, "completions/mean_length": 22.0625, "completions/mean_terminated_length": 22.0625, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.2742653606411398, "frac_reward_zero_std": 1.0, "grad_norm": 0.4475157558917999, "kl": 2.87551811337471, "learning_rate": 4.035131123206334e-06, "loss": 0.10836954414844513, "num_tokens": 5506588.0, "reward": 0.17500000074505806, "reward_std": 0.29398736357688904, "rewards/reward_financial_reasoning/mean": 0.17500000074505806, "rewards/reward_financial_reasoning/std": 0.29398736357688904, "step": 616, "step_time": 18.957288730998698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 20.1875, "completions/mean_terminated_length": 20.1875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.27515583259127335, "frac_reward_zero_std": 0.75, "grad_norm": 0.8842321634292603, "kl": 2.9271684885025024, "learning_rate": 4.030183077684315e-06, "loss": 0.11716306954622269, "num_tokens": 5526535.0, "reward": 0.21250000968575478, "reward_std": 0.20310094952583313, "rewards/reward_financial_reasoning/mean": 0.21250000968575478, "rewards/reward_financial_reasoning/std": 0.20310097932815552, "step": 618, "step_time": 17.811359782997897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.5, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.27604630454140694, "frac_reward_zero_std": 1.0, "grad_norm": 1.110678791999817, "kl": 2.3337113112211227, "learning_rate": 4.025235032162297e-06, "loss": 0.0931580662727356, "num_tokens": 5551335.0, "reward": -3.725290298461914e-09, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 620, "step_time": 24.730726426994806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.9375, "completions/clipped_ratio": 0.0625, "completions/max_length": 135.5, "completions/max_terminated_length": 11.0, "completions/mean_length": 23.9375, "completions/mean_terminated_length": 8.321428775787354, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.2769367764915405, "frac_reward_zero_std": 0.75, "grad_norm": 0.3603140413761139, "kl": 2.8234359323978424, "learning_rate": 4.020286986640278e-06, "loss": 0.06738406419754028, "num_tokens": 5560406.0, "reward": 0.4125000089406967, "reward_std": 0.5242162793874741, "rewards/reward_financial_reasoning/mean": 0.4125000089406967, "rewards/reward_financial_reasoning/std": 0.5242162793874741, "step": 622, "step_time": 40.600682643504115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 138.0, "completions/max_terminated_length": 16.5, "completions/mean_length": 28.0, "completions/mean_terminated_length": 12.678571701049805, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.2778272484416741, "frac_reward_zero_std": 0.75, "grad_norm": 11.575019836425781, "kl": 2.758111670613289, "learning_rate": 4.0153389411182585e-06, "loss": 0.10476522147655487, "num_tokens": 5579686.0, "reward": -0.0875000013038516, "reward_std": 0.266422763466835, "rewards/reward_financial_reasoning/mean": -0.0875000013038516, "rewards/reward_financial_reasoning/std": 0.266422763466835, "step": 624, "step_time": 48.948475874003634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.375, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 14.375, "completions/mean_terminated_length": 14.375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.27871772039180764, "frac_reward_zero_std": 0.75, "grad_norm": 0.24415989220142365, "kl": 3.251735508441925, "learning_rate": 4.01039089559624e-06, "loss": 0.10590354353189468, "num_tokens": 5600260.0, "reward": 0.11250000447034836, "reward_std": 0.511647641658783, "rewards/reward_financial_reasoning/mean": 0.11250000447034836, "rewards/reward_financial_reasoning/std": 0.5116476565599442, "step": 626, "step_time": 17.442232475510536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.75, "completions/clipped_ratio": 0.0625, "completions/max_length": 137.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 29.75, "completions/mean_terminated_length": 14.464285850524902, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.27960819234194123, "frac_reward_zero_std": 1.0, "grad_norm": 0.1346379518508911, "kl": 2.713219091296196, "learning_rate": 4.005442850074221e-06, "loss": 0.08727573603391647, "num_tokens": 5614192.0, "reward": 0.4999999888241291, "reward_std": 0.05345224589109421, "rewards/reward_financial_reasoning/mean": 0.4999999888241291, "rewards/reward_financial_reasoning/std": 0.05345224589109421, "step": 628, "step_time": 44.28616671049895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.5, "completions/clipped_ratio": 0.0625, "completions/max_length": 132.5, "completions/max_terminated_length": 45.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 12.625, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.2804986642920748, "frac_reward_zero_std": 0.75, "grad_norm": 0.4055415093898773, "kl": 4.209835857152939, "learning_rate": 4.000494804552202e-06, "loss": 0.17613010108470917, "num_tokens": 5626128.0, "reward": 0.3999999910593033, "reward_std": 0.6133611500263214, "rewards/reward_financial_reasoning/mean": 0.3999999910593033, "rewards/reward_financial_reasoning/std": 0.6133611798286438, "step": 630, "step_time": 41.569186088505376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.25, "completions/clipped_ratio": 0.0625, "completions/max_length": 139.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 29.25, "completions/mean_terminated_length": 14.196428775787354, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.28138913624220835, "frac_reward_zero_std": 1.0, "grad_norm": 0.3633623719215393, "kl": 2.2347910553216934, "learning_rate": 3.995546759030183e-06, "loss": 0.07843896746635437, "num_tokens": 5644300.0, "reward": 0.1250000074505806, "reward_std": 0.34743961691856384, "rewards/reward_financial_reasoning/mean": 0.1250000074505806, "rewards/reward_financial_reasoning/std": 0.34743963181972504, "step": 632, "step_time": 49.13458822750181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 12.8125, "completions/mean_terminated_length": 12.8125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.28227960819234194, "frac_reward_zero_std": 1.0, "grad_norm": 0.4399714767932892, "kl": 2.800263747572899, "learning_rate": 3.990598713508165e-06, "loss": 0.11733436584472656, "num_tokens": 5663081.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 634, "step_time": 15.632818516496627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.5, "completions/clipped_ratio": 0.0625, "completions/max_length": 171.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 36.125000953674316, "completions/min_length": 20.5, "completions/min_terminated_length": 20.5, "epoch": 0.2831700801424755, "frac_reward_zero_std": 0.5, "grad_norm": 10.8825044631958, "kl": 2.876441642642021, "learning_rate": 3.985650667986146e-06, "loss": 0.23618704080581665, "num_tokens": 5688585.0, "reward": 0.012499998323619366, "reward_std": 0.19864802062511444, "rewards/reward_financial_reasoning/mean": 0.012499998323619366, "rewards/reward_financial_reasoning/std": 0.19864803552627563, "step": 636, "step_time": 62.42338538400145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 55.25, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 57.5, "completions/mean_length": 55.25, "completions/mean_terminated_length": 26.57142972946167, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2840605520926091, "frac_reward_zero_std": 0.5, "grad_norm": 1.063951849937439, "kl": 2.164351548999548, "learning_rate": 3.980702622464127e-06, "loss": 0.2528214454650879, "num_tokens": 5705493.0, "reward": 0.32500001788139343, "reward_std": 0.40620189905166626, "rewards/reward_financial_reasoning/mean": 0.32500001788139343, "rewards/reward_financial_reasoning/std": 0.40620195865631104, "step": 638, "step_time": 79.79323933250271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0625, "completions/max_length": 134.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 17.035715103149414, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.28495102404274264, "frac_reward_zero_std": 1.0, "grad_norm": 0.08490260690450668, "kl": 2.701087385416031, "learning_rate": 3.975754576942108e-06, "loss": 0.08981408923864365, "num_tokens": 5724549.0, "reward": -0.12500000186264515, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": -0.12500000186264515, "rewards/reward_financial_reasoning/std": 0.24053511768579483, "step": 640, "step_time": 48.10033849150204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5625, "completions/clipped_ratio": 0.0625, "completions/max_length": 138.5, "completions/max_terminated_length": 40.5, "completions/mean_length": 31.5625, "completions/mean_terminated_length": 16.678571701049805, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2858414959928762, "frac_reward_zero_std": 0.75, "grad_norm": 11.19996166229248, "kl": 3.914921998977661, "learning_rate": 3.97080653142009e-06, "loss": 0.10081885010004044, "num_tokens": 5743766.0, "reward": 0.38750001788139343, "reward_std": 0.39018382132053375, "rewards/reward_financial_reasoning/mean": 0.38750001788139343, "rewards/reward_financial_reasoning/std": 0.39018386602401733, "step": 642, "step_time": 47.918799355502415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 37.5, "completions/max_terminated_length": 37.5, "completions/mean_length": 28.3125, "completions/mean_terminated_length": 28.3125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2867319679430098, "frac_reward_zero_std": 1.0, "grad_norm": 0.07037749886512756, "kl": 2.5467402040958405, "learning_rate": 3.9658584858980706e-06, "loss": 0.10022091865539551, "num_tokens": 5764507.0, "reward": -0.10000000521540642, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": -0.10000000521540642, "rewards/reward_financial_reasoning/std": 0.0, "step": 644, "step_time": 21.87775604199851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 57.5, "completions/max_terminated_length": 57.5, "completions/mean_length": 22.3125, "completions/mean_terminated_length": 22.3125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.28762243989314334, "frac_reward_zero_std": 1.0, "grad_norm": 0.3838065266609192, "kl": 34.147406324744225, "learning_rate": 3.9609104403760515e-06, "loss": 1.4865283966064453, "num_tokens": 5780000.0, "reward": -0.025000005960464478, "reward_std": 0.29398736357688904, "rewards/reward_financial_reasoning/mean": -0.025000005960464478, "rewards/reward_financial_reasoning/std": 0.29398736357688904, "step": 646, "step_time": 23.692439714999637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 34.5, "completions/max_terminated_length": 34.5, "completions/mean_length": 13.5625, "completions/mean_terminated_length": 13.5625, "completions/min_length": 4.5, "completions/min_terminated_length": 4.5, "epoch": 0.28851291184327693, "frac_reward_zero_std": 0.75, "grad_norm": 0.09616874158382416, "kl": 4.008978515863419, "learning_rate": 3.955962394854033e-06, "loss": 0.13894228637218475, "num_tokens": 5797961.0, "reward": -0.14999999850988388, "reward_std": 0.4457136541604996, "rewards/reward_financial_reasoning/mean": -0.14999999850988388, "rewards/reward_financial_reasoning/std": 0.4457136541604996, "step": 648, "step_time": 19.492217393992178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.5, "completions/clipped_ratio": 0.0, "completions/max_length": 33.5, "completions/max_terminated_length": 33.5, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2894033837934105, "frac_reward_zero_std": 0.75, "grad_norm": 0.10026784986257553, "kl": 2.471968561410904, "learning_rate": 3.951014349332014e-06, "loss": 0.04348517209291458, "num_tokens": 5812921.0, "reward": -0.0624999962747097, "reward_std": 0.219983771443367, "rewards/reward_financial_reasoning/mean": -0.0624999962747097, "rewards/reward_financial_reasoning/std": 0.219983771443367, "step": 650, "step_time": 17.18861940100396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.125, "completions/clipped_ratio": 0.0625, "completions/max_length": 138.5, "completions/max_terminated_length": 36.5, "completions/mean_length": 34.125, "completions/mean_terminated_length": 19.44642925262451, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.2902938557435441, "frac_reward_zero_std": 0.5, "grad_norm": 4.763707160949707, "kl": 250.4700199663639, "learning_rate": 3.946066303809995e-06, "loss": 15.857925415039062, "num_tokens": 5833435.0, "reward": 0.17500000912696123, "reward_std": 0.30916696786880493, "rewards/reward_financial_reasoning/mean": 0.17500000912696123, "rewards/reward_financial_reasoning/std": 0.3091669976711273, "step": 652, "step_time": 50.153412438994565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 60.4375, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 18.5, "completions/mean_length": 60.4375, "completions/mean_terminated_length": 15.488095760345459, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.29118432769367764, "frac_reward_zero_std": 0.5, "grad_norm": 2.0384180545806885, "kl": 1.534380428493023, "learning_rate": 3.941118258287976e-06, "loss": 0.04604524374008179, "num_tokens": 5850522.0, "reward": 0.21250001154839993, "reward_std": 0.3419739603996277, "rewards/reward_financial_reasoning/mean": 0.21250001154839993, "rewards/reward_financial_reasoning/std": 0.3419739902019501, "step": 654, "step_time": 79.99165332850316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 46.5, "completions/max_terminated_length": 46.5, "completions/mean_length": 22.5625, "completions/mean_terminated_length": 22.5625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2920747996438112, "frac_reward_zero_std": 1.0, "grad_norm": 0.6213010549545288, "kl": 3.3993609100580215, "learning_rate": 3.936170212765958e-06, "loss": 0.12003728747367859, "num_tokens": 5867427.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 656, "step_time": 22.32193847149756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 28.0625, "completions/mean_terminated_length": 28.0625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2929652715939448, "frac_reward_zero_std": 1.0, "grad_norm": 0.232942596077919, "kl": 2.3799022883176804, "learning_rate": 3.931222167243939e-06, "loss": 0.0874049961566925, "num_tokens": 5885148.0, "reward": -3.725290298461914e-09, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 658, "step_time": 23.394534400998964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 15.4375, "completions/mean_terminated_length": 15.4375, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.29385574354407834, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328476071357727, "kl": 2.801008328795433, "learning_rate": 3.92627412172192e-06, "loss": 0.10386461019515991, "num_tokens": 5904787.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 660, "step_time": 16.91145214050266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 11.3125, "completions/mean_terminated_length": 11.3125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.2947462154942119, "frac_reward_zero_std": 1.0, "grad_norm": 7.384799957275391, "kl": 4.985250949859619, "learning_rate": 3.921326076199901e-06, "loss": 0.18682938814163208, "num_tokens": 5917672.0, "reward": 0.6249999850988388, "reward_std": 0.40089183300733566, "rewards/reward_financial_reasoning/mean": 0.6249999850988388, "rewards/reward_financial_reasoning/std": 0.40089183300733566, "step": 662, "step_time": 12.575487746493309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.0625, "completions/clipped_ratio": 0.0625, "completions/max_length": 149.0, "completions/max_terminated_length": 31.5, "completions/mean_length": 35.0625, "completions/mean_terminated_length": 20.267857551574707, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2956366874443455, "frac_reward_zero_std": 0.75, "grad_norm": 1.0087742805480957, "kl": 2.6491604149341583, "learning_rate": 3.916378030677883e-06, "loss": 0.21765044331550598, "num_tokens": 5940305.0, "reward": -0.03750000428408384, "reward_std": 0.21297051757574081, "rewards/reward_financial_reasoning/mean": -0.03750000428408384, "rewards/reward_financial_reasoning/std": 0.2129705250263214, "step": 664, "step_time": 54.70537975649859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.4375, "completions/clipped_ratio": 0.0625, "completions/max_length": 161.0, "completions/max_terminated_length": 56.5, "completions/mean_length": 40.4375, "completions/mean_terminated_length": 25.803571701049805, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.2965271593944791, "frac_reward_zero_std": 0.75, "grad_norm": 0.06418345123529434, "kl": 3.329156816005707, "learning_rate": 3.911429985155864e-06, "loss": 0.1880941092967987, "num_tokens": 5957008.0, "reward": 0.06250000093132257, "reward_std": 0.1060660183429718, "rewards/reward_financial_reasoning/mean": 0.06250000093132257, "rewards/reward_financial_reasoning/std": 0.1060660183429718, "step": 666, "step_time": 53.807816513504804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.29741763134461263, "frac_reward_zero_std": 1.0, "grad_norm": 0.1388830840587616, "kl": 2.4952100068330765, "learning_rate": 3.9064819396338445e-06, "loss": 0.09984306991100311, "num_tokens": 5979428.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 668, "step_time": 19.566764060000423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.5, "completions/clipped_ratio": 0.0, "completions/max_length": 18.5, "completions/max_terminated_length": 18.5, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.2983081032947462, "frac_reward_zero_std": 1.0, "grad_norm": 0.17398419976234436, "kl": 3.8046407103538513, "learning_rate": 3.901533894111826e-06, "loss": 0.14238014817237854, "num_tokens": 5994700.0, "reward": 0.02499999850988388, "reward_std": 0.45434410870075226, "rewards/reward_financial_reasoning/mean": 0.02499999850988388, "rewards/reward_financial_reasoning/std": 0.45434410870075226, "step": 670, "step_time": 13.631318826504867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 39.9375, "completions/clipped_ratio": 0.0625, "completions/max_length": 162.5, "completions/max_terminated_length": 46.5, "completions/mean_length": 39.9375, "completions/mean_terminated_length": 25.142857551574707, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.2991985752448798, "frac_reward_zero_std": 0.75, "grad_norm": 20.34697723388672, "kl": 7.88471856713295, "learning_rate": 3.896585848589807e-06, "loss": 0.20272746682167053, "num_tokens": 6014331.0, "reward": -1.862645149230957e-09, "reward_std": 0.1508890464901924, "rewards/reward_financial_reasoning/mean": -1.862645149230957e-09, "rewards/reward_financial_reasoning/std": 0.150889053940773, "step": 672, "step_time": 57.11428980049823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 65.875, "completions/clipped_ratio": 0.1875, "completions/max_length": 139.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 65.875, "completions/mean_terminated_length": 25.225000381469727, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.30008904719501334, "frac_reward_zero_std": 0.75, "grad_norm": 0.6960481405258179, "kl": 3.615657825022936, "learning_rate": 3.891637803067789e-06, "loss": 0.07131641358137131, "num_tokens": 6026945.0, "reward": 0.4624999910593033, "reward_std": 0.6227896511554718, "rewards/reward_financial_reasoning/mean": 0.4624999910593033, "rewards/reward_financial_reasoning/std": 0.6227896213531494, "step": 674, "step_time": 43.584765961499215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.75, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 25.75, "completions/mean_terminated_length": 25.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3009795191451469, "frac_reward_zero_std": 0.75, "grad_norm": 0.05800027772784233, "kl": 2.021700158715248, "learning_rate": 3.88668975754577e-06, "loss": 0.04579779505729675, "num_tokens": 6042589.0, "reward": 0.38750001788139343, "reward_std": 0.39018382132053375, "rewards/reward_financial_reasoning/mean": 0.38750001788139343, "rewards/reward_financial_reasoning/std": 0.39018386602401733, "step": 676, "step_time": 21.839032033509284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 13.6875, "completions/mean_terminated_length": 13.6875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.3018699910952805, "frac_reward_zero_std": 1.0, "grad_norm": 0.451972097158432, "kl": 2.8006647378206253, "learning_rate": 3.881741712023751e-06, "loss": 0.09243463724851608, "num_tokens": 6059048.0, "reward": 0.45000001788139343, "reward_std": 0.37416574358940125, "rewards/reward_financial_reasoning/mean": 0.45000001788139343, "rewards/reward_financial_reasoning/std": 0.37416577339172363, "step": 678, "step_time": 15.941890390007757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.125, "completions/clipped_ratio": 0.0, "completions/max_length": 93.5, "completions/max_terminated_length": 93.5, "completions/mean_length": 22.125, "completions/mean_terminated_length": 22.125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3027604630454141, "frac_reward_zero_std": 0.75, "grad_norm": 4.601978778839111, "kl": 3.8233794271945953, "learning_rate": 3.876793666501732e-06, "loss": 0.1953509896993637, "num_tokens": 6070754.0, "reward": 0.08750000596046448, "reward_std": 0.25599944591522217, "rewards/reward_financial_reasoning/mean": 0.08750000596046448, "rewards/reward_financial_reasoning/std": 0.25599944591522217, "step": 680, "step_time": 31.333621428992046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 40.5, "completions/max_terminated_length": 40.5, "completions/mean_length": 28.1875, "completions/mean_terminated_length": 28.1875, "completions/min_length": 20.5, "completions/min_terminated_length": 20.5, "epoch": 0.30365093499554763, "frac_reward_zero_std": 1.0, "grad_norm": 0.2807849049568176, "kl": 2.814168304204941, "learning_rate": 3.871845620979714e-06, "loss": 0.11746909469366074, "num_tokens": 6088965.0, "reward": 0.07500000298023224, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.07500000298023224, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 682, "step_time": 21.245715715005645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5, "completions/clipped_ratio": 0.0, "completions/max_length": 19.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.3045414069456812, "frac_reward_zero_std": 1.0, "grad_norm": 0.09957019239664078, "kl": 3.1638737618923187, "learning_rate": 3.866897575457695e-06, "loss": 0.11611393094062805, "num_tokens": 6105413.0, "reward": 0.17500000447034836, "reward_std": 0.29398737102746964, "rewards/reward_financial_reasoning/mean": 0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.2939873933792114, "step": 684, "step_time": 14.758325781003805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.75, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 33.75, "completions/mean_terminated_length": 33.75, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.3054318788958148, "frac_reward_zero_std": 1.0, "grad_norm": 0.048933424055576324, "kl": 2.0513499826192856, "learning_rate": 3.861949529935676e-06, "loss": 0.0797392949461937, "num_tokens": 6122497.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 686, "step_time": 23.42249280249962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 19.6875, "completions/mean_terminated_length": 19.6875, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.30632235084594833, "frac_reward_zero_std": 1.0, "grad_norm": 0.07173699885606766, "kl": 2.6596494019031525, "learning_rate": 3.857001484413657e-06, "loss": 0.1044078841805458, "num_tokens": 6135284.0, "reward": 0.2750000059604645, "reward_std": 0.5612486004829407, "rewards/reward_financial_reasoning/mean": 0.2750000059604645, "rewards/reward_financial_reasoning/std": 0.5612486004829407, "step": 688, "step_time": 13.520752940003149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 24.3125, "completions/mean_terminated_length": 24.3125, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.3072128227960819, "frac_reward_zero_std": 0.75, "grad_norm": 0.16115054488182068, "kl": 3.1704325079917908, "learning_rate": 3.852053438891638e-06, "loss": 0.0991721972823143, "num_tokens": 6152249.0, "reward": -0.01249999925494194, "reward_std": 0.12464234232902527, "rewards/reward_financial_reasoning/mean": -0.01249999925494194, "rewards/reward_financial_reasoning/std": 0.12464234232902527, "step": 690, "step_time": 17.69266291499298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 55.9375, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 55.9375, "completions/mean_terminated_length": 9.869048118591309, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.3081032947462155, "frac_reward_zero_std": 0.75, "grad_norm": 2.471900463104248, "kl": 2.3048749417066574, "learning_rate": 3.847105393369619e-06, "loss": -0.026355061680078506, "num_tokens": 6168208.0, "reward": 0.11250000447034836, "reward_std": 0.511647641658783, "rewards/reward_financial_reasoning/mean": 0.11250000447034836, "rewards/reward_financial_reasoning/std": 0.5116476565599442, "step": 692, "step_time": 78.48041973699947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.6875, "completions/clipped_ratio": 0.125, "completions/max_length": 174.5, "completions/max_terminated_length": 50.5, "completions/mean_length": 48.6875, "completions/mean_terminated_length": 17.4375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.3089937666963491, "frac_reward_zero_std": 0.75, "grad_norm": 0.3391161859035492, "kl": 2.7371310964226723, "learning_rate": 3.8421573478476e-06, "loss": 0.17964963614940643, "num_tokens": 6181139.0, "reward": 0.2500000074505806, "reward_std": 0.46365733444690704, "rewards/reward_financial_reasoning/mean": 0.2500000074505806, "rewards/reward_financial_reasoning/std": 0.46365734934806824, "step": 694, "step_time": 54.42009541150037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.375, "completions/clipped_ratio": 0.0625, "completions/max_length": 133.0, "completions/max_terminated_length": 8.5, "completions/mean_length": 23.375, "completions/mean_terminated_length": 7.794642925262451, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.3098842386464826, "frac_reward_zero_std": 1.0, "grad_norm": 0.5152590274810791, "kl": 2.381638616323471, "learning_rate": 3.837209302325582e-06, "loss": 0.09065329283475876, "num_tokens": 6196249.0, "reward": -0.02499999850988388, "reward_std": 0.5077963322401047, "rewards/reward_financial_reasoning/mean": -0.02499999850988388, "rewards/reward_financial_reasoning/std": 0.5077963322401047, "step": 696, "step_time": 46.216524112001935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.5, "completions/max_terminated_length": 58.5, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.3107747105966162, "frac_reward_zero_std": 1.0, "grad_norm": 0.3281242847442627, "kl": 2.509884476661682, "learning_rate": 3.832261256803563e-06, "loss": 0.09444789588451385, "num_tokens": 6215585.0, "reward": -0.30000000447034836, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -0.30000000447034836, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 698, "step_time": 27.547750868001458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 25.1875, "completions/mean_terminated_length": 25.1875, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.3116651825467498, "frac_reward_zero_std": 1.0, "grad_norm": 0.1636444628238678, "kl": 2.3138733953237534, "learning_rate": 3.827313211281544e-06, "loss": 0.07426071166992188, "num_tokens": 6238084.0, "reward": 0.17500000447034836, "reward_std": 0.29398737102746964, "rewards/reward_financial_reasoning/mean": 0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.2939873933792114, "step": 700, "step_time": 25.016320683502272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 50.5, "completions/max_terminated_length": 50.5, "completions/mean_length": 22.3125, "completions/mean_terminated_length": 22.3125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.31255565449688333, "frac_reward_zero_std": 1.0, "grad_norm": 0.051183633506298065, "kl": 2.2624500691890717, "learning_rate": 3.822365165759525e-06, "loss": 0.08230020105838776, "num_tokens": 6255713.0, "reward": 0.17500000447034836, "reward_std": 0.29398737102746964, "rewards/reward_financial_reasoning/mean": 0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.2939873933792114, "step": 702, "step_time": 23.571445789493737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.3134461264470169, "frac_reward_zero_std": 0.75, "grad_norm": 0.08755451440811157, "kl": 2.5206730663776398, "learning_rate": 3.817417120237507e-06, "loss": 0.19678334891796112, "num_tokens": 6277597.0, "reward": 0.32500001043081284, "reward_std": 0.38347896933555603, "rewards/reward_financial_reasoning/mean": 0.32500001043081284, "rewards/reward_financial_reasoning/std": 0.3834789991378784, "step": 704, "step_time": 25.50389101500332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 75.125, "completions/clipped_ratio": 0.25, "completions/max_length": 143.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 75.125, "completions/mean_terminated_length": 14.625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3143365983971505, "frac_reward_zero_std": 1.0, "grad_norm": 0.20571209490299225, "kl": 2.407316707074642, "learning_rate": 3.8124690747154876e-06, "loss": 0.09111398458480835, "num_tokens": 6293983.0, "reward": -0.15000000037252903, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -0.15000000037252903, "rewards/reward_financial_reasoning/std": 0.10690449923276901, "step": 706, "step_time": 47.3297834570003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 21.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.31522707034728403, "frac_reward_zero_std": 1.0, "grad_norm": 0.12484844774007797, "kl": 3.3755542635917664, "learning_rate": 3.807521029193469e-06, "loss": 0.13404133915901184, "num_tokens": 6309839.0, "reward": -0.10000000521540642, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": -0.10000000521540642, "rewards/reward_financial_reasoning/std": 0.0, "step": 708, "step_time": 16.799556064499484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 14.8125, "completions/mean_terminated_length": 14.8125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.3161175422974176, "frac_reward_zero_std": 0.75, "grad_norm": 2.978635787963867, "kl": 3.1937762200832367, "learning_rate": 3.8025729836714504e-06, "loss": 0.08270812034606934, "num_tokens": 6327140.0, "reward": 0.23750001192092896, "reward_std": 0.5119454711675644, "rewards/reward_financial_reasoning/mean": 0.23750001192092896, "rewards/reward_financial_reasoning/std": 0.5119454860687256, "step": 710, "step_time": 16.252217650497187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 74.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 74.0, "completions/mean_terminated_length": 12.285715103149414, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.3170080142475512, "frac_reward_zero_std": 0.5, "grad_norm": 2.725724697113037, "kl": 3.6923981085419655, "learning_rate": 3.7976249381494313e-06, "loss": 0.07456135749816895, "num_tokens": 6344436.0, "reward": -0.17499999701976776, "reward_std": 0.3752485066652298, "rewards/reward_financial_reasoning/mean": -0.17499999701976776, "rewards/reward_financial_reasoning/std": 0.375248521566391, "step": 712, "step_time": 80.90963733000171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 88.0625, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 15.5, "completions/mean_length": 88.0625, "completions/mean_terminated_length": 12.43333387374878, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.3178984861976848, "frac_reward_zero_std": 0.75, "grad_norm": 0.1844349503517151, "kl": 2.4319792985916138, "learning_rate": 3.7926768926274127e-06, "loss": 0.11355370283126831, "num_tokens": 6365565.0, "reward": -0.07499999739229679, "reward_std": 0.36553528159856796, "rewards/reward_financial_reasoning/mean": -0.07499999739229679, "rewards/reward_financial_reasoning/std": 0.36553528159856796, "step": 714, "step_time": 83.99197188099424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 100.5, "completions/max_terminated_length": 100.5, "completions/mean_length": 44.5625, "completions/mean_terminated_length": 44.5625, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.3187889581478183, "frac_reward_zero_std": 1.0, "grad_norm": 16.39630699157715, "kl": 2.6740624085068703, "learning_rate": 3.7877288471053937e-06, "loss": 0.10633575171232224, "num_tokens": 6382526.0, "reward": 0.17500000447034836, "reward_std": 0.29398737102746964, "rewards/reward_financial_reasoning/mean": 0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.2939873933792114, "step": 716, "step_time": 46.1464865390044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.25, "completions/clipped_ratio": 0.0625, "completions/max_length": 140.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 33.25, "completions/mean_terminated_length": 18.142857551574707, "completions/min_length": 16.5, "completions/min_terminated_length": 16.5, "epoch": 0.3196794300979519, "frac_reward_zero_std": 1.0, "grad_norm": 0.09907568991184235, "kl": 3.2122897580266, "learning_rate": 3.782780801583375e-06, "loss": 0.12161869555711746, "num_tokens": 6399386.0, "reward": 0.07500000298023224, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.07500000298023224, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 718, "step_time": 46.925280134506465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 54.375, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 54.375, "completions/mean_terminated_length": 54.375, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.3205699020480855, "frac_reward_zero_std": 1.0, "grad_norm": 0.2224346101284027, "kl": 1.7517793476581573, "learning_rate": 3.777832756061356e-06, "loss": 0.052426815032958984, "num_tokens": 6421864.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 720, "step_time": 73.51747539999997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.0625, "completions/clipped_ratio": 0.0625, "completions/max_length": 139.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 32.0625, "completions/mean_terminated_length": 16.830357551574707, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.32146037399821903, "frac_reward_zero_std": 1.0, "grad_norm": 2.532606601715088, "kl": 2.519416108727455, "learning_rate": 3.7728847105393374e-06, "loss": 0.09540431946516037, "num_tokens": 6433569.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 722, "step_time": 44.58530493949365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.875, "completions/clipped_ratio": 0.0625, "completions/max_length": 141.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 27.875, "completions/mean_terminated_length": 12.196428775787354, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.3223508459483526, "frac_reward_zero_std": 1.0, "grad_norm": 0.13162082433700562, "kl": 5.542090013623238, "learning_rate": 3.7679366650173183e-06, "loss": 0.21324403584003448, "num_tokens": 6446543.0, "reward": 0.1750000026077032, "reward_std": 0.40089186280965805, "rewards/reward_financial_reasoning/mean": 0.1750000026077032, "rewards/reward_financial_reasoning/std": 0.40089186280965805, "step": 724, "step_time": 46.05484523400082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.125, "completions/clipped_ratio": 0.0625, "completions/max_length": 145.5, "completions/max_terminated_length": 25.0, "completions/mean_length": 32.125, "completions/mean_terminated_length": 16.66964292526245, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3232413178984862, "frac_reward_zero_std": 0.75, "grad_norm": 1.7261240482330322, "kl": 4.556912407279015, "learning_rate": 3.7629886194952997e-06, "loss": 0.23144212365150452, "num_tokens": 6454881.0, "reward": 0.5374999940395355, "reward_std": 0.47036218643188477, "rewards/reward_financial_reasoning/mean": 0.5374999940395355, "rewards/reward_financial_reasoning/std": 0.47036220133304596, "step": 726, "step_time": 43.249197524495685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.625, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 20.625, "completions/mean_terminated_length": 20.625, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.3241317898486198, "frac_reward_zero_std": 1.0, "grad_norm": 0.1306663453578949, "kl": 2.114730104804039, "learning_rate": 3.7580405739732806e-06, "loss": 0.08357012271881104, "num_tokens": 6470147.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 728, "step_time": 21.64191951349494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 26.8125, "completions/mean_terminated_length": 26.8125, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.3250222617987533, "frac_reward_zero_std": 1.0, "grad_norm": 0.6427158117294312, "kl": 2.4815937876701355, "learning_rate": 3.753092528451262e-06, "loss": 0.0976918414235115, "num_tokens": 6492520.0, "reward": -3.725290298461914e-09, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 730, "step_time": 24.04095314299775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 61.25, "completions/clipped_ratio": 0.1875, "completions/max_length": 135.0, "completions/max_terminated_length": 19.5, "completions/mean_length": 61.25, "completions/mean_terminated_length": 17.487500190734863, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.3259127337488869, "frac_reward_zero_std": 0.75, "grad_norm": 0.49971044063568115, "kl": 2.464243160560727, "learning_rate": 3.7481444829292434e-06, "loss": 0.20906202495098114, "num_tokens": 6518100.0, "reward": -0.16250000149011612, "reward_std": 0.34886594116687775, "rewards/reward_financial_reasoning/mean": -0.16250000149011612, "rewards/reward_financial_reasoning/std": 0.34886594116687775, "step": 732, "step_time": 53.07256942349704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.375, "completions/clipped_ratio": 0.0625, "completions/max_length": 178.5, "completions/max_terminated_length": 64.0, "completions/mean_length": 44.375, "completions/mean_terminated_length": 29.98214340209961, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.3268032056990205, "frac_reward_zero_std": 0.5, "grad_norm": 1.2685043811798096, "kl": 2.9134768545627594, "learning_rate": 3.7431964374072243e-06, "loss": 0.045311249792575836, "num_tokens": 6540474.0, "reward": 0.012500000186264515, "reward_std": 0.35391390323638916, "rewards/reward_financial_reasoning/mean": 0.012500000186264515, "rewards/reward_financial_reasoning/std": 0.35391390323638916, "step": 734, "step_time": 62.79009689700615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.0625, "completions/clipped_ratio": 0.25, "completions/max_length": 147.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 76.0625, "completions/mean_terminated_length": 16.5625, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.327693677649154, "frac_reward_zero_std": 0.75, "grad_norm": 0.8403189778327942, "kl": 2.6961580216884613, "learning_rate": 3.7382483918852057e-06, "loss": 0.08386936783790588, "num_tokens": 6554147.0, "reward": 0.2875000089406967, "reward_std": 0.49708831310272217, "rewards/reward_financial_reasoning/mean": 0.2875000089406967, "rewards/reward_financial_reasoning/std": 0.49708834290504456, "step": 736, "step_time": 47.54676332350209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 56.25, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 9.880952596664429, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.3285841495992876, "frac_reward_zero_std": 0.75, "grad_norm": 0.19178739190101624, "kl": 7.0429524183273315, "learning_rate": 3.7333003463631866e-06, "loss": 0.14826089143753052, "num_tokens": 6569375.0, "reward": 0.48749998956918716, "reward_std": 0.2176603004336357, "rewards/reward_financial_reasoning/mean": 0.48749998956918716, "rewards/reward_financial_reasoning/std": 0.2176603153347969, "step": 738, "step_time": 77.7282326079985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.25, "completions/clipped_ratio": 0.0625, "completions/max_length": 136.5, "completions/max_terminated_length": 14.0, "completions/mean_length": 26.25, "completions/mean_terminated_length": 10.723214387893677, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3294746215494212, "frac_reward_zero_std": 1.0, "grad_norm": 4.231341361999512, "kl": 2.5500387847423553, "learning_rate": 3.728352300841168e-06, "loss": 0.09851864725351334, "num_tokens": 6586515.0, "reward": 0.05000000260770321, "reward_std": 0.42761795967817307, "rewards/reward_financial_reasoning/mean": 0.05000000260770321, "rewards/reward_financial_reasoning/std": 0.42761795967817307, "step": 740, "step_time": 47.18414942449817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 34.5, "completions/max_terminated_length": 34.5, "completions/mean_length": 16.5625, "completions/mean_terminated_length": 16.5625, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.3303650934995548, "frac_reward_zero_std": 1.0, "grad_norm": 0.2784968912601471, "kl": 2.6422544419765472, "learning_rate": 3.723404255319149e-06, "loss": 0.09954223781824112, "num_tokens": 6605620.0, "reward": -0.30000000447034836, "reward_std": 0.21380899101495743, "rewards/reward_financial_reasoning/mean": -0.30000000447034836, "rewards/reward_financial_reasoning/std": 0.21380899101495743, "step": 742, "step_time": 19.979887659497763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.4375, "completions/mean_terminated_length": 12.4375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.3312555654496883, "frac_reward_zero_std": 1.0, "grad_norm": 0.17147980630397797, "kl": 3.159319758415222, "learning_rate": 3.7184562097971303e-06, "loss": 0.1264461874961853, "num_tokens": 6625067.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 744, "step_time": 14.4231962595004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 43.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 132.5, "completions/max_terminated_length": 132.5, "completions/mean_length": 43.9375, "completions/mean_terminated_length": 43.9375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.3321460373998219, "frac_reward_zero_std": 0.75, "grad_norm": 0.28793638944625854, "kl": 6.326580494642258, "learning_rate": 3.7135081642751113e-06, "loss": 0.20715469121932983, "num_tokens": 6640762.0, "reward": 0.4375000149011612, "reward_std": 0.44363605976104736, "rewards/reward_financial_reasoning/mean": 0.4375000149011612, "rewards/reward_financial_reasoning/std": 0.44363610446453094, "step": 746, "step_time": 43.157302853003785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.625, "completions/clipped_ratio": 0.0625, "completions/max_length": 138.5, "completions/max_terminated_length": 40.5, "completions/mean_length": 40.625, "completions/mean_terminated_length": 26.8125, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.3330365093499555, "frac_reward_zero_std": 1.0, "grad_norm": 0.47648006677627563, "kl": 2.5065064430236816, "learning_rate": 3.7085601187530927e-06, "loss": 0.09898876398801804, "num_tokens": 6663932.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 748, "step_time": 51.96744587050125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5, "completions/clipped_ratio": 0.0625, "completions/max_length": 143.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 27.48214340209961, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.333926981300089, "frac_reward_zero_std": 0.75, "grad_norm": 2.8360135555267334, "kl": 1.4322513043880463, "learning_rate": 3.7036120732310745e-06, "loss": 0.07792390882968903, "num_tokens": 6685132.0, "reward": 0.26250001043081284, "reward_std": 0.3512909263372421, "rewards/reward_financial_reasoning/mean": 0.26250001043081284, "rewards/reward_financial_reasoning/std": 0.3512909561395645, "step": 750, "step_time": 51.64344793999771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 32.5, "completions/max_terminated_length": 32.5, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.3348174532502226, "frac_reward_zero_std": 1.0, "grad_norm": 0.11603382974863052, "kl": 2.018613636493683, "learning_rate": 3.698664027709055e-06, "loss": 0.09922308474779129, "num_tokens": 6706376.0, "reward": 0.02499999850988388, "reward_std": 0.45434410870075226, "rewards/reward_financial_reasoning/mean": 0.02499999850988388, "rewards/reward_financial_reasoning/std": 0.45434410870075226, "step": 752, "step_time": 20.996847507503844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.75, "completions/clipped_ratio": 0.0625, "completions/max_length": 140.0, "completions/max_terminated_length": 22.5, "completions/mean_length": 35.75, "completions/mean_terminated_length": 20.9375, "completions/min_length": 19.5, "completions/min_terminated_length": 19.5, "epoch": 0.3357079252003562, "frac_reward_zero_std": 0.75, "grad_norm": 1.3406031131744385, "kl": 3.415686219930649, "learning_rate": 3.6937159821870368e-06, "loss": 0.23348277807235718, "num_tokens": 6727876.0, "reward": -0.13750001043081284, "reward_std": 0.2256779968738556, "rewards/reward_financial_reasoning/mean": -0.13750001043081284, "rewards/reward_financial_reasoning/std": 0.2256779968738556, "step": 754, "step_time": 51.19353974749902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.625, "completions/clipped_ratio": 0.0, "completions/max_length": 16.5, "completions/max_terminated_length": 16.5, "completions/mean_length": 10.625, "completions/mean_terminated_length": 10.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.3365983971504898, "frac_reward_zero_std": 1.0, "grad_norm": 0.23292890191078186, "kl": 3.3537650406360626, "learning_rate": 3.6887679366650177e-06, "loss": 0.12877531349658966, "num_tokens": 6742294.0, "reward": 0.30000001192092896, "reward_std": 0.5345224589109421, "rewards/reward_financial_reasoning/mean": 0.30000001192092896, "rewards/reward_financial_reasoning/std": 0.5345224738121033, "step": 756, "step_time": 12.437249256501673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.4375, "completions/clipped_ratio": 0.0625, "completions/max_length": 209.5, "completions/max_terminated_length": 95.0, "completions/mean_length": 42.4375, "completions/mean_terminated_length": 27.973215103149414, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.3374888691006233, "frac_reward_zero_std": 0.75, "grad_norm": 2.973015785217285, "kl": 2.390896290540695, "learning_rate": 3.683819891142999e-06, "loss": 0.17431329190731049, "num_tokens": 6759933.0, "reward": 0.38750001788139343, "reward_std": 0.39018382132053375, "rewards/reward_financial_reasoning/mean": 0.38750001788139343, "rewards/reward_financial_reasoning/std": 0.39018386602401733, "step": 758, "step_time": 67.14941029350302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.1875, "completions/clipped_ratio": 0.125, "completions/max_length": 146.0, "completions/max_terminated_length": 30.5, "completions/mean_length": 50.1875, "completions/mean_terminated_length": 20.83333396911621, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3383793410507569, "frac_reward_zero_std": 0.75, "grad_norm": 0.08029485493898392, "kl": 2.803036607801914, "learning_rate": 3.67887184562098e-06, "loss": 0.13053226470947266, "num_tokens": 6777072.0, "reward": 0.0, "reward_std": 0.35675284266471863, "rewards/reward_financial_reasoning/mean": 0.0, "rewards/reward_financial_reasoning/std": 0.3567528575658798, "step": 760, "step_time": 49.21179148500232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 64.9375, "completions/clipped_ratio": 0.1875, "completions/max_length": 133.5, "completions/max_terminated_length": 95.5, "completions/mean_length": 64.9375, "completions/mean_terminated_length": 24.287500381469727, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.3392698130008905, "frac_reward_zero_std": 1.0, "grad_norm": 0.07963932305574417, "kl": 1.750863203778863, "learning_rate": 3.6739238000989614e-06, "loss": 0.07047466188669205, "num_tokens": 6792711.0, "reward": 0.19999998807907104, "reward_std": 0.05345224589109421, "rewards/reward_financial_reasoning/mean": 0.19999998807907104, "rewards/reward_financial_reasoning/std": 0.05345224589109421, "step": 762, "step_time": 45.05649412400089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 32.3125, "completions/mean_terminated_length": 32.3125, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.340160284951024, "frac_reward_zero_std": 1.0, "grad_norm": 1.80032479763031, "kl": 2.7984056919813156, "learning_rate": 3.6689757545769424e-06, "loss": 0.09850553423166275, "num_tokens": 6815724.0, "reward": -0.15000000223517418, "reward_std": 0.05345224589109421, "rewards/reward_financial_reasoning/mean": -0.15000000223517418, "rewards/reward_financial_reasoning/std": 0.05345224589109421, "step": 764, "step_time": 29.943595519500377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 30.5, "completions/max_terminated_length": 30.5, "completions/mean_length": 20.3125, "completions/mean_terminated_length": 20.3125, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.3410507569011576, "frac_reward_zero_std": 1.0, "grad_norm": 1.911179780960083, "kl": 2.389465108513832, "learning_rate": 3.6640277090549238e-06, "loss": 0.0941983088850975, "num_tokens": 6831737.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 766, "step_time": 17.419960692499444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 15.8125, "completions/mean_terminated_length": 15.8125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3419412288512912, "frac_reward_zero_std": 1.0, "grad_norm": 1.5256670713424683, "kl": 3.1901561617851257, "learning_rate": 3.6590796635329047e-06, "loss": 0.12186402082443237, "num_tokens": 6847790.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 768, "step_time": 15.88296320849986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 37.5, "completions/max_terminated_length": 37.5, "completions/mean_length": 21.8125, "completions/mean_terminated_length": 21.8125, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.3428317008014248, "frac_reward_zero_std": 0.75, "grad_norm": 4.21360969543457, "kl": 2.3833318948745728, "learning_rate": 3.654131618010886e-06, "loss": 0.1340945065021515, "num_tokens": 6867195.0, "reward": 0.06250000093132257, "reward_std": 0.1060660183429718, "rewards/reward_financial_reasoning/mean": 0.06250000093132257, "rewards/reward_financial_reasoning/std": 0.1060660183429718, "step": 770, "step_time": 21.20947382099621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5625, "completions/clipped_ratio": 0.0625, "completions/max_length": 137.5, "completions/max_terminated_length": 26.0, "completions/mean_length": 31.5625, "completions/mean_terminated_length": 17.017857551574707, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3437221727515583, "frac_reward_zero_std": 0.75, "grad_norm": 0.20955723524093628, "kl": 2.9231202751398087, "learning_rate": 3.6491835724888675e-06, "loss": 0.17583100497722626, "num_tokens": 6884020.0, "reward": -0.0875000013038516, "reward_std": 0.266422763466835, "rewards/reward_financial_reasoning/mean": -0.0875000013038516, "rewards/reward_financial_reasoning/std": 0.266422763466835, "step": 772, "step_time": 47.02959246299724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 20.0, "completions/mean_terminated_length": 20.0, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.3446126447016919, "frac_reward_zero_std": 1.0, "grad_norm": 0.1046200692653656, "kl": 2.869453191757202, "learning_rate": 3.6442355269668484e-06, "loss": 0.1020328626036644, "num_tokens": 6904068.0, "reward": 0.17500000447034836, "reward_std": 0.40089185535907745, "rewards/reward_financial_reasoning/mean": 0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.40089187026023865, "step": 774, "step_time": 24.619027701508458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 13.5, "completions/max_terminated_length": 13.5, "completions/mean_length": 12.4375, "completions/mean_terminated_length": 12.4375, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.3455031166518255, "frac_reward_zero_std": 1.0, "grad_norm": 0.09514451026916504, "kl": 22.426249851079774, "learning_rate": 3.6392874814448298e-06, "loss": 0.939132571220398, "num_tokens": 6919491.0, "reward": 0.6749999970197678, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": 0.6749999970197678, "rewards/reward_financial_reasoning/std": 0.24053513258695602, "step": 776, "step_time": 13.205288213495805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.4375, "completions/clipped_ratio": 0.0625, "completions/max_length": 137.0, "completions/max_terminated_length": 28.5, "completions/mean_length": 25.4375, "completions/mean_terminated_length": 10.312500476837158, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.346393588601959, "frac_reward_zero_std": 0.75, "grad_norm": 0.6481949090957642, "kl": 3.8522638976573944, "learning_rate": 3.6343394359228107e-06, "loss": 0.21328844130039215, "num_tokens": 6936298.0, "reward": 0.23750001192092896, "reward_std": 0.5505405366420746, "rewards/reward_financial_reasoning/mean": 0.23750001192092896, "rewards/reward_financial_reasoning/std": 0.550540566444397, "step": 778, "step_time": 47.54922737349989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.9375, "completions/mean_terminated_length": 9.9375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.3472840605520926, "frac_reward_zero_std": 0.75, "grad_norm": 21.361452102661133, "kl": 6.1261356472969055, "learning_rate": 3.629391390400792e-06, "loss": 0.28381842374801636, "num_tokens": 6956857.0, "reward": 0.6124999970197678, "reward_std": 0.39018382132053375, "rewards/reward_financial_reasoning/mean": 0.6124999970197678, "rewards/reward_financial_reasoning/std": 0.39018385112285614, "step": 780, "step_time": 16.095249286994658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.25, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 14.5, "completions/mean_length": 42.25, "completions/mean_terminated_length": 11.714285850524902, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3481745325022262, "frac_reward_zero_std": 0.75, "grad_norm": 5.043459415435791, "kl": 2.4115554690361023, "learning_rate": 3.624443344878773e-06, "loss": 0.18826088309288025, "num_tokens": 6980629.0, "reward": 0.2874999865889549, "reward_std": 0.36345769464969635, "rewards/reward_financial_reasoning/mean": 0.2874999865889549, "rewards/reward_financial_reasoning/std": 0.36345770955085754, "step": 782, "step_time": 86.04697552700236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.875, "completions/clipped_ratio": 0.0625, "completions/max_length": 134.0, "completions/max_terminated_length": 27.5, "completions/mean_length": 28.875, "completions/mean_terminated_length": 14.008929252624512, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3490650044523598, "frac_reward_zero_std": 0.75, "grad_norm": 0.16503266990184784, "kl": 1.425231909379363, "learning_rate": 3.6194952993567544e-06, "loss": -0.029601268470287323, "num_tokens": 6996299.0, "reward": 0.03749999403953552, "reward_std": 0.5514859259128571, "rewards/reward_financial_reasoning/mean": 0.03749999403953552, "rewards/reward_financial_reasoning/std": 0.5514859408140182, "step": 784, "step_time": 46.38959728450209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 108.875, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 32.5, "completions/mean_length": 108.875, "completions/mean_terminated_length": 18.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.3499554764024933, "frac_reward_zero_std": 0.75, "grad_norm": 2.0817203521728516, "kl": 2.4215634018182755, "learning_rate": 3.6145472538347354e-06, "loss": 0.07867254316806793, "num_tokens": 7012225.0, "reward": 0.03749999403953552, "reward_std": 0.35606882721185684, "rewards/reward_financial_reasoning/mean": 0.03749999403953552, "rewards/reward_financial_reasoning/std": 0.35606882721185684, "step": 786, "step_time": 77.41537013850393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 53.5, "completions/max_terminated_length": 53.5, "completions/mean_length": 19.9375, "completions/mean_terminated_length": 19.9375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.3508459483526269, "frac_reward_zero_std": 1.0, "grad_norm": 1.1960225105285645, "kl": 3.1713290363550186, "learning_rate": 3.6095992083127167e-06, "loss": 0.11753413081169128, "num_tokens": 7032832.0, "reward": 0.3499999865889549, "reward_std": 0.21380899101495743, "rewards/reward_financial_reasoning/mean": 0.3499999865889549, "rewards/reward_financial_reasoning/std": 0.21380899101495743, "step": 788, "step_time": 26.475698017497052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 30.5, "completions/max_terminated_length": 30.5, "completions/mean_length": 18.1875, "completions/mean_terminated_length": 18.1875, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.3517364203027605, "frac_reward_zero_std": 1.0, "grad_norm": 0.3880739212036133, "kl": 1.8633519411087036, "learning_rate": 3.6046511627906977e-06, "loss": 0.07043511420488358, "num_tokens": 7052539.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 790, "step_time": 19.250361710492143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 51.0, "completions/clipped_ratio": 0.125, "completions/max_length": 144.5, "completions/max_terminated_length": 36.5, "completions/mean_length": 51.0, "completions/mean_terminated_length": 21.08333396911621, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.352626892252894, "frac_reward_zero_std": 0.75, "grad_norm": 5.956282138824463, "kl": 3.7996502816677094, "learning_rate": 3.599703117268679e-06, "loss": 0.10686469823122025, "num_tokens": 7075259.0, "reward": 0.20000000670552254, "reward_std": 0.3760698735713959, "rewards/reward_financial_reasoning/mean": 0.20000000670552254, "rewards/reward_financial_reasoning/std": 0.37606990337371826, "step": 792, "step_time": 52.96954656899834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.125, "completions/clipped_ratio": 0.0625, "completions/max_length": 140.0, "completions/max_terminated_length": 34.0, "completions/mean_length": 30.125, "completions/mean_terminated_length": 15.428571701049805, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.3535173642030276, "frac_reward_zero_std": 0.75, "grad_norm": 1997.9693603515625, "kl": 342.5339174568653, "learning_rate": 3.5947550717466604e-06, "loss": 9.447096824645996, "num_tokens": 7092589.0, "reward": 0.36250001192092896, "reward_std": 0.5238144397735596, "rewards/reward_financial_reasoning/mean": 0.36250001192092896, "rewards/reward_financial_reasoning/std": 0.523814469575882, "step": 794, "step_time": 47.50602113500645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 55.5, "completions/max_terminated_length": 55.5, "completions/mean_length": 17.5625, "completions/mean_terminated_length": 17.5625, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.3544078361531612, "frac_reward_zero_std": 1.0, "grad_norm": 0.4262450933456421, "kl": 2.2002196609973907, "learning_rate": 3.5898070262246414e-06, "loss": 0.09302681684494019, "num_tokens": 7111166.0, "reward": -0.2749999985098839, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": -0.2749999985098839, "rewards/reward_financial_reasoning/std": 0.24053511768579483, "step": 796, "step_time": 25.643957539497933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.0, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 16.0, "completions/mean_terminated_length": 16.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.35529830810329477, "frac_reward_zero_std": 1.0, "grad_norm": 0.12062408030033112, "kl": 2.885861501097679, "learning_rate": 3.5848589807026228e-06, "loss": 0.11578873544931412, "num_tokens": 7133662.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 798, "step_time": 18.64683330200205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.9375, "completions/clipped_ratio": 0.0625, "completions/max_length": 182.0, "completions/max_terminated_length": 63.5, "completions/mean_length": 34.9375, "completions/mean_terminated_length": 19.535714626312256, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3561887800534283, "frac_reward_zero_std": 0.75, "grad_norm": 0.21271134912967682, "kl": 2.7550200819969177, "learning_rate": 3.5799109351806037e-06, "loss": 0.15305285155773163, "num_tokens": 7154069.0, "reward": -0.03750000149011612, "reward_std": 0.4317670986056328, "rewards/reward_financial_reasoning/mean": -0.03750000149011612, "rewards/reward_financial_reasoning/std": 0.4317671060562134, "step": 800, "step_time": 60.9049749980004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 49.4375, "completions/clipped_ratio": 0.0625, "completions/max_length": 158.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 49.4375, "completions/mean_terminated_length": 35.66964340209961, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.3570792520035619, "frac_reward_zero_std": 1.0, "grad_norm": 0.05105382576584816, "kl": 1.9183711856603622, "learning_rate": 3.574962889658585e-06, "loss": 0.05830772966146469, "num_tokens": 7169716.0, "reward": -0.04999999701976776, "reward_std": 0.37416573613882065, "rewards/reward_financial_reasoning/mean": -0.04999999701976776, "rewards/reward_financial_reasoning/std": 0.37416573613882065, "step": 802, "step_time": 53.53614674049095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 107.375, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 20.5, "completions/mean_length": 107.375, "completions/mean_terminated_length": 19.166666984558105, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.3579697239536955, "frac_reward_zero_std": 0.75, "grad_norm": 1.246635913848877, "kl": 25.036666467785835, "learning_rate": 3.570014844136566e-06, "loss": 1.1702524423599243, "num_tokens": 7187594.0, "reward": 0.07500000111758709, "reward_std": 0.276574470102787, "rewards/reward_financial_reasoning/mean": 0.07500000111758709, "rewards/reward_financial_reasoning/std": 0.2765744850039482, "step": 804, "step_time": 80.11308830999769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.5625, "completions/clipped_ratio": 0.125, "completions/max_length": 140.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 46.5625, "completions/mean_terminated_length": 16.64583396911621, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.358860195903829, "frac_reward_zero_std": 0.75, "grad_norm": 0.19011610746383667, "kl": 2.6784499436616898, "learning_rate": 3.565066798614548e-06, "loss": 0.20479558408260345, "num_tokens": 7196979.0, "reward": 0.17500000447034836, "reward_std": 0.2314550280570984, "rewards/reward_financial_reasoning/mean": 0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.2314550280570984, "step": 806, "step_time": 43.39767270999437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.6875, "completions/clipped_ratio": 0.0625, "completions/max_length": 136.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 29.6875, "completions/mean_terminated_length": 14.875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.3597506678539626, "frac_reward_zero_std": 1.0, "grad_norm": 1.1912928819656372, "kl": 3.8488663136959076, "learning_rate": 3.5601187530925284e-06, "loss": 0.1310974359512329, "num_tokens": 7214110.0, "reward": 0.6250000149011612, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.6250000149011612, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 808, "step_time": 48.412980310498824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 18.6875, "completions/mean_terminated_length": 18.6875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3606411398040962, "frac_reward_zero_std": 1.0, "grad_norm": 0.11078284680843353, "kl": 2.806388795375824, "learning_rate": 3.55517070757051e-06, "loss": 0.10246068984270096, "num_tokens": 7233401.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 810, "step_time": 18.20412028949795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.625, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 17.625, "completions/mean_terminated_length": 17.625, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.36153161175422976, "frac_reward_zero_std": 1.0, "grad_norm": 0.17546121776103973, "kl": 2.027177184820175, "learning_rate": 3.5502226620484907e-06, "loss": 0.08124741911888123, "num_tokens": 7251419.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 812, "step_time": 17.06027586250275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 49.6875, "completions/clipped_ratio": 0.125, "completions/max_length": 138.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 49.6875, "completions/mean_terminated_length": 20.33333396911621, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3624220837043633, "frac_reward_zero_std": 0.75, "grad_norm": 0.1511795073747635, "kl": 2.3284588307142258, "learning_rate": 3.5452746165264725e-06, "loss": 0.19820040464401245, "num_tokens": 7269582.0, "reward": 0.02500000037252903, "reward_std": 0.13887301087379456, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.13887301087379456, "step": 814, "step_time": 47.642690775999654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 26.5, "completions/max_terminated_length": 26.5, "completions/mean_length": 20.5625, "completions/mean_terminated_length": 20.5625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3633125556544969, "frac_reward_zero_std": 1.0, "grad_norm": 0.18224821984767914, "kl": 3.339336931705475, "learning_rate": 3.540326571004454e-06, "loss": 0.13968926668167114, "num_tokens": 7287047.0, "reward": -0.20000001043081284, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -0.20000001043081284, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 816, "step_time": 16.942964851496072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 83.5625, "completions/clipped_ratio": 0.25, "completions/max_length": 148.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 83.5625, "completions/mean_terminated_length": 22.5625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.36420302760463047, "frac_reward_zero_std": 0.75, "grad_norm": 1.5113762617111206, "kl": 1.7933121919631958, "learning_rate": 3.535378525482435e-06, "loss": 0.08266621828079224, "num_tokens": 7302512.0, "reward": 0.17500000074505806, "reward_std": 0.4242233335971832, "rewards/reward_financial_reasoning/mean": 0.17500000074505806, "rewards/reward_financial_reasoning/std": 0.4242233335971832, "step": 818, "step_time": 47.5636702519987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.875, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 16.875, "completions/mean_terminated_length": 16.875, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.365093499554764, "frac_reward_zero_std": 1.0, "grad_norm": 0.2267179638147354, "kl": 4.005930542945862, "learning_rate": 3.530430479960416e-06, "loss": 0.1383998841047287, "num_tokens": 7322222.0, "reward": -0.12500000186264515, "reward_std": 0.24053511768579483, "rewards/reward_financial_reasoning/mean": -0.12500000186264515, "rewards/reward_financial_reasoning/std": 0.24053511768579483, "step": 820, "step_time": 19.202351930998702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.75, "completions/clipped_ratio": 0.0, "completions/max_length": 32.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 23.75, "completions/mean_terminated_length": 23.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3659839715048976, "frac_reward_zero_std": 1.0, "grad_norm": 0.08866865932941437, "kl": 2.016817420721054, "learning_rate": 3.525482434438397e-06, "loss": 0.08069662004709244, "num_tokens": 7339330.0, "reward": 0.17500000447034836, "reward_std": 0.29398737102746964, "rewards/reward_financial_reasoning/mean": 0.17500000447034836, "rewards/reward_financial_reasoning/std": 0.2939873933792114, "step": 822, "step_time": 18.552810501500062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 10.375, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.375, "completions/mean_terminated_length": 10.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.3668744434550312, "frac_reward_zero_std": 1.0, "grad_norm": 0.1785629242658615, "kl": 2.7913960814476013, "learning_rate": 3.5205343889163785e-06, "loss": 0.11267250776290894, "num_tokens": 7356440.0, "reward": -0.20000000298023224, "reward_std": 0.32071349024772644, "rewards/reward_financial_reasoning/mean": -0.20000000298023224, "rewards/reward_financial_reasoning/std": 0.32071349024772644, "step": 824, "step_time": 13.125887238500582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 15.4375, "completions/mean_terminated_length": 15.4375, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.36776491540516476, "frac_reward_zero_std": 1.0, "grad_norm": 0.34747812151908875, "kl": 2.4959478676319122, "learning_rate": 3.5155863433943594e-06, "loss": 0.09693935513496399, "num_tokens": 7375239.0, "reward": 0.1250000074505806, "reward_std": 0.34743961691856384, "rewards/reward_financial_reasoning/mean": 0.1250000074505806, "rewards/reward_financial_reasoning/std": 0.34743963181972504, "step": 826, "step_time": 16.67322523299299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.375, "completions/clipped_ratio": 0.0, "completions/max_length": 17.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 11.375, "completions/mean_terminated_length": 11.375, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.3686553873552983, "frac_reward_zero_std": 0.75, "grad_norm": 0.09147421270608902, "kl": 4.733070477843285, "learning_rate": 3.510638297872341e-06, "loss": 0.16881364583969116, "num_tokens": 7393853.0, "reward": 0.13750000670552254, "reward_std": 0.36228442192077637, "rewards/reward_financial_reasoning/mean": 0.13750000670552254, "rewards/reward_financial_reasoning/std": 0.36228442192077637, "step": 828, "step_time": 15.064109992996237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 15.5625, "completions/mean_terminated_length": 15.5625, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.3695458593054319, "frac_reward_zero_std": 0.75, "grad_norm": 2.8771681785583496, "kl": 3.5439845621585846, "learning_rate": 3.5056902523503218e-06, "loss": 0.15652626752853394, "num_tokens": 7415678.0, "reward": 0.10000000149011612, "reward_std": 0.1963960975408554, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.1963960975408554, "step": 830, "step_time": 18.503596431502956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 22.4375, "completions/mean_terminated_length": 22.4375, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.37043633125556547, "frac_reward_zero_std": 0.75, "grad_norm": 0.17112646996974945, "kl": 2.819959133863449, "learning_rate": 3.500742206828303e-06, "loss": 0.12506897747516632, "num_tokens": 7429525.0, "reward": 0.20000000670552254, "reward_std": 0.2905927151441574, "rewards/reward_financial_reasoning/mean": 0.20000000670552254, "rewards/reward_financial_reasoning/std": 0.2905927300453186, "step": 832, "step_time": 14.886599326004216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 21.5, "completions/max_terminated_length": 21.5, "completions/mean_length": 15.8125, "completions/mean_terminated_length": 15.8125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.371326803205699, "frac_reward_zero_std": 1.0, "grad_norm": 0.14576126635074615, "kl": 2.9340180456638336, "learning_rate": 3.4957941613062845e-06, "loss": 0.11859500408172607, "num_tokens": 7448242.0, "reward": -0.15000000223517418, "reward_std": 0.05345224589109421, "rewards/reward_financial_reasoning/mean": -0.15000000223517418, "rewards/reward_financial_reasoning/std": 0.05345224589109421, "step": 834, "step_time": 16.496135844503442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.5, "completions/max_terminated_length": 42.5, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 22.5, "completions/min_terminated_length": 22.5, "epoch": 0.3722172751558326, "frac_reward_zero_std": 1.0, "grad_norm": 0.11725213378667831, "kl": 1.9590845555067062, "learning_rate": 3.4908461157842655e-06, "loss": 0.07787135988473892, "num_tokens": 7465378.0, "reward": -3.725290298461914e-09, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 836, "step_time": 21.61521376399469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 32.5, "completions/clipped_ratio": 0.0625, "completions/max_length": 138.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 17.589285850524902, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.37310774710596617, "frac_reward_zero_std": 0.75, "grad_norm": 2.323338031768799, "kl": 2.400451347231865, "learning_rate": 3.485898070262247e-06, "loss": 0.17991583049297333, "num_tokens": 7479754.0, "reward": 0.2875000089406967, "reward_std": 0.4670701175928116, "rewards/reward_financial_reasoning/mean": 0.2875000089406967, "rewards/reward_financial_reasoning/std": 0.4670701324939728, "step": 838, "step_time": 46.93090354149899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 87.5, "completions/max_terminated_length": 87.5, "completions/mean_length": 30.5625, "completions/mean_terminated_length": 30.5625, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.37399821905609976, "frac_reward_zero_std": 1.0, "grad_norm": 0.14068838953971863, "kl": 2.13110613822937, "learning_rate": 3.480950024740228e-06, "loss": 0.08287981897592545, "num_tokens": 7502115.0, "reward": 0.1250000037252903, "reward_std": 0.34743958711624146, "rewards/reward_financial_reasoning/mean": 0.1250000037252903, "rewards/reward_financial_reasoning/std": 0.34743958711624146, "step": 840, "step_time": 38.0494445104996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 35.5, "completions/max_terminated_length": 35.5, "completions/mean_length": 27.1875, "completions/mean_terminated_length": 27.1875, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.3748886910062333, "frac_reward_zero_std": 1.0, "grad_norm": 0.2852267026901245, "kl": 2.796256124973297, "learning_rate": 3.476001979218209e-06, "loss": 0.11120793223381042, "num_tokens": 7519318.0, "reward": 0.07500000298023224, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.07500000298023224, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 842, "step_time": 19.269342353996763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.375, "completions/clipped_ratio": 0.0, "completions/max_length": 20.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 13.375, "completions/mean_terminated_length": 13.375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.3757791629563669, "frac_reward_zero_std": 1.0, "grad_norm": 0.4274088740348816, "kl": 3.7996322214603424, "learning_rate": 3.47105393369619e-06, "loss": 0.12690269947052002, "num_tokens": 7541116.0, "reward": 0.1250000074505806, "reward_std": 0.34743961691856384, "rewards/reward_financial_reasoning/mean": 0.1250000074505806, "rewards/reward_financial_reasoning/std": 0.34743963181972504, "step": 844, "step_time": 18.219404364499496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.1875, "completions/clipped_ratio": 0.0, "completions/max_length": 17.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 13.1875, "completions/mean_terminated_length": 13.1875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.37666963490650046, "frac_reward_zero_std": 1.0, "grad_norm": 0.23236680030822754, "kl": 2.7688030302524567, "learning_rate": 3.4661058881741715e-06, "loss": 0.10391891002655029, "num_tokens": 7559863.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 846, "step_time": 15.469687197000894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.1875, "completions/clipped_ratio": 0.0625, "completions/max_length": 157.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 37.1875, "completions/mean_terminated_length": 21.991071701049805, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.377560106856634, "frac_reward_zero_std": 1.0, "grad_norm": 0.13132241368293762, "kl": 2.3280729204416275, "learning_rate": 3.4611578426521524e-06, "loss": 0.07760872691869736, "num_tokens": 7581346.0, "reward": 0.10000000894069672, "reward_std": 0.21380899846553802, "rewards/reward_financial_reasoning/mean": 0.10000000894069672, "rewards/reward_financial_reasoning/std": 0.21380901336669922, "step": 848, "step_time": 57.75018220250058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 44.5, "completions/clipped_ratio": 0.125, "completions/max_length": 134.5, "completions/max_terminated_length": 15.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 14.625, "completions/min_length": 14.5, "completions/min_terminated_length": 14.5, "epoch": 0.3784505788067676, "frac_reward_zero_std": 0.75, "grad_norm": 1.0872406959533691, "kl": 1.559784710407257, "learning_rate": 3.456209797130134e-06, "loss": 0.11022936552762985, "num_tokens": 7594410.0, "reward": 0.2500000074505806, "reward_std": 0.2577935457229614, "rewards/reward_financial_reasoning/mean": 0.2500000074505806, "rewards/reward_financial_reasoning/std": 0.2577935680747032, "step": 850, "step_time": 45.99375293400226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 134.5, "completions/max_terminated_length": 93.0, "completions/mean_length": 37.0, "completions/mean_terminated_length": 23.4375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.37934105075690117, "frac_reward_zero_std": 0.75, "grad_norm": 12.833792686462402, "kl": 3.972812756896019, "learning_rate": 3.4512617516081148e-06, "loss": 0.2475769817829132, "num_tokens": 7615866.0, "reward": -0.16249999683350325, "reward_std": 0.1862443909049034, "rewards/reward_financial_reasoning/mean": -0.16249999683350325, "rewards/reward_financial_reasoning/std": 0.1862443909049034, "step": 852, "step_time": 51.58180875749531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 47.5, "completions/max_terminated_length": 47.5, "completions/mean_length": 23.3125, "completions/mean_terminated_length": 23.3125, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.38023152270703475, "frac_reward_zero_std": 1.0, "grad_norm": 0.23057197034358978, "kl": 3.2737006843090057, "learning_rate": 3.446313706086096e-06, "loss": 0.10352115333080292, "num_tokens": 7638007.0, "reward": -0.125, "reward_std": 0.29398736357688904, "rewards/reward_financial_reasoning/mean": -0.125, "rewards/reward_financial_reasoning/std": 0.29398736357688904, "step": 854, "step_time": 26.51089113649141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 46.5, "completions/max_terminated_length": 46.5, "completions/mean_length": 12.4375, "completions/mean_terminated_length": 12.4375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.3811219946571683, "frac_reward_zero_std": 0.75, "grad_norm": 8.749869346618652, "kl": 3.574433773756027, "learning_rate": 3.441365660564078e-06, "loss": 0.08661140501499176, "num_tokens": 7652638.0, "reward": -0.21249999850988388, "reward_std": 0.3803405165672302, "rewards/reward_financial_reasoning/mean": -0.21249999850988388, "rewards/reward_financial_reasoning/std": 0.3803405165672302, "step": 856, "step_time": 21.310353426000802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 59.75, "completions/clipped_ratio": 0.1875, "completions/max_length": 142.5, "completions/max_terminated_length": 18.0, "completions/mean_length": 59.75, "completions/mean_terminated_length": 12.912500143051147, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.38201246660730187, "frac_reward_zero_std": 0.75, "grad_norm": 0.19118884205818176, "kl": 3.842833936214447, "learning_rate": 3.4364176150420585e-06, "loss": 0.18493984639644623, "num_tokens": 7669394.0, "reward": 0.23749998770654202, "reward_std": 0.38225453346967697, "rewards/reward_financial_reasoning/mean": 0.23749998770654202, "rewards/reward_financial_reasoning/std": 0.3822545036673546, "step": 858, "step_time": 48.92800594100481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.875, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 13.875, "completions/mean_terminated_length": 13.875, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.38290293855743546, "frac_reward_zero_std": 1.0, "grad_norm": 0.16126124560832977, "kl": 4.304675847291946, "learning_rate": 3.4314695695200403e-06, "loss": 0.1495298147201538, "num_tokens": 7687552.0, "reward": 0.3499999865889549, "reward_std": 0.21380899101495743, "rewards/reward_financial_reasoning/mean": 0.3499999865889549, "rewards/reward_financial_reasoning/std": 0.21380899101495743, "step": 860, "step_time": 17.01638667500447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 9.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 14.5, "completions/max_terminated_length": 14.5, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 9.6875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.383793410507569, "frac_reward_zero_std": 1.0, "grad_norm": 0.17267750203609467, "kl": 4.147587463259697, "learning_rate": 3.4265215239980208e-06, "loss": 0.15131743252277374, "num_tokens": 7699363.0, "reward": 0.5249999910593033, "reward_std": 0.40089183300733566, "rewards/reward_financial_reasoning/mean": 0.5249999910593033, "rewards/reward_financial_reasoning/std": 0.40089183300733566, "step": 862, "step_time": 10.703602764002426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 58.0625, "completions/clipped_ratio": 0.125, "completions/max_length": 136.5, "completions/max_terminated_length": 128.0, "completions/mean_length": 58.0625, "completions/mean_terminated_length": 32.1875, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.3846838824577026, "frac_reward_zero_std": 0.75, "grad_norm": 0.8160765767097473, "kl": 2.1349672228097916, "learning_rate": 3.4215734784760026e-06, "loss": 0.06090102344751358, "num_tokens": 7719948.0, "reward": 0.23750001192092896, "reward_std": 0.5119454711675644, "rewards/reward_financial_reasoning/mean": 0.23750001192092896, "rewards/reward_financial_reasoning/std": 0.5119454860687256, "step": 864, "step_time": 51.53723118200287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.75, "completions/clipped_ratio": 0.0625, "completions/max_length": 136.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 28.75, "completions/mean_terminated_length": 13.500000476837158, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.38557435440783616, "frac_reward_zero_std": 1.0, "grad_norm": 0.14556773006916046, "kl": 2.87981840968132, "learning_rate": 3.416625432953983e-06, "loss": 0.10689441114664078, "num_tokens": 7741352.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 866, "step_time": 51.90839271850564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 61.0625, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 61.0625, "completions/mean_terminated_length": 16.321428775787354, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.38646482635796975, "frac_reward_zero_std": 0.5, "grad_norm": 2.3730831146240234, "kl": 1.4520172700285912, "learning_rate": 3.411677387431965e-06, "loss": 0.2352294623851776, "num_tokens": 7761969.0, "reward": 0.3125000149011612, "reward_std": 0.49342095851898193, "rewards/reward_financial_reasoning/mean": 0.3125000149011612, "rewards/reward_financial_reasoning/std": 0.4934210181236267, "step": 868, "step_time": 84.79485128700253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 29.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 68.5, "completions/max_terminated_length": 68.5, "completions/mean_length": 29.6875, "completions/mean_terminated_length": 29.6875, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.3873552983081033, "frac_reward_zero_std": 1.0, "grad_norm": 0.8227584958076477, "kl": 3.320344388484955, "learning_rate": 3.406729341909946e-06, "loss": 0.11288516968488693, "num_tokens": 7775516.0, "reward": 0.4000000059604645, "reward_std": 0.5345224589109421, "rewards/reward_financial_reasoning/mean": 0.4000000059604645, "rewards/reward_financial_reasoning/std": 0.5345224738121033, "step": 870, "step_time": 26.270858613999735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 24.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 28.5, "completions/max_terminated_length": 28.5, "completions/mean_length": 24.9375, "completions/mean_terminated_length": 24.9375, "completions/min_length": 20.5, "completions/min_terminated_length": 20.5, "epoch": 0.38824577025823687, "frac_reward_zero_std": 1.0, "grad_norm": 0.056442953646183014, "kl": 1.64863820374012, "learning_rate": 3.4017812963879272e-06, "loss": 0.06417623907327652, "num_tokens": 7790747.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 872, "step_time": 16.352528546009125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 42.8125, "completions/clipped_ratio": 0.125, "completions/max_length": 138.5, "completions/max_terminated_length": 17.5, "completions/mean_length": 42.8125, "completions/mean_terminated_length": 11.791666746139526, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.38913624220837045, "frac_reward_zero_std": 1.0, "grad_norm": 0.22031089663505554, "kl": 2.7123608253896236, "learning_rate": 3.396833250865908e-06, "loss": 0.10119879245758057, "num_tokens": 7803032.0, "reward": 0.37500000558793545, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.37500000558793545, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 874, "step_time": 44.841470889492484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 25.125, "completions/clipped_ratio": 0.0, "completions/max_length": 31.5, "completions/max_terminated_length": 31.5, "completions/mean_length": 25.125, "completions/mean_terminated_length": 25.125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.390026714158504, "frac_reward_zero_std": 1.0, "grad_norm": 0.15465430915355682, "kl": 2.474703371524811, "learning_rate": 3.3918852053438895e-06, "loss": 0.09784232825040817, "num_tokens": 7823546.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 876, "step_time": 20.668088107500807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.375, "completions/clipped_ratio": 0.0, "completions/max_length": 48.5, "completions/max_terminated_length": 48.5, "completions/mean_length": 20.375, "completions/mean_terminated_length": 20.375, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.39091718610863757, "frac_reward_zero_std": 1.0, "grad_norm": 0.2885158956050873, "kl": 2.143370568752289, "learning_rate": 3.386937159821871e-06, "loss": 0.08493253588676453, "num_tokens": 7845432.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 878, "step_time": 26.40964586049813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 20.4375, "completions/mean_terminated_length": 20.4375, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.39180765805877116, "frac_reward_zero_std": 1.0, "grad_norm": 0.3435024321079254, "kl": 3.9613396525382996, "learning_rate": 3.381989114299852e-06, "loss": 0.14171364903450012, "num_tokens": 7864639.0, "reward": -0.07500000298023224, "reward_std": 0.34743960946798325, "rewards/reward_financial_reasoning/mean": -0.07500000298023224, "rewards/reward_financial_reasoning/std": 0.34743960946798325, "step": 880, "step_time": 21.07708440350325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.5, "completions/max_terminated_length": 59.5, "completions/mean_length": 19.0, "completions/mean_terminated_length": 19.0, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.39269813000890474, "frac_reward_zero_std": 1.0, "grad_norm": 0.30529144406318665, "kl": 2.770953595638275, "learning_rate": 3.3770410687778332e-06, "loss": 0.1095418706536293, "num_tokens": 7881399.0, "reward": 0.1250000037252903, "reward_std": 0.34743958711624146, "rewards/reward_financial_reasoning/mean": 0.1250000037252903, "rewards/reward_financial_reasoning/std": 0.34743958711624146, "step": 882, "step_time": 26.48739288449724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 48.0625, "completions/clipped_ratio": 0.125, "completions/max_length": 138.5, "completions/max_terminated_length": 36.0, "completions/mean_length": 48.0625, "completions/mean_terminated_length": 17.9375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.3935886019590383, "frac_reward_zero_std": 0.75, "grad_norm": 2.556560516357422, "kl": 4.03894579410553, "learning_rate": 3.372093023255814e-06, "loss": 0.06743745505809784, "num_tokens": 7902416.0, "reward": -0.0624999962747097, "reward_std": 0.219983771443367, "rewards/reward_financial_reasoning/mean": -0.0624999962747097, "rewards/reward_financial_reasoning/std": 0.219983771443367, "step": 884, "step_time": 51.95550639049543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 96.5, "completions/max_terminated_length": 96.5, "completions/mean_length": 34.5625, "completions/mean_terminated_length": 34.5625, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.39447907390917186, "frac_reward_zero_std": 1.0, "grad_norm": 5.515413761138916, "kl": 2.448286272585392, "learning_rate": 3.3671449777337956e-06, "loss": 0.09613915532827377, "num_tokens": 7921649.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 886, "step_time": 38.43645350350198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 50.8125, "completions/clipped_ratio": 0.125, "completions/max_length": 135.5, "completions/max_terminated_length": 74.5, "completions/mean_length": 50.8125, "completions/mean_terminated_length": 22.9375, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.39536954585930545, "frac_reward_zero_std": 1.0, "grad_norm": 0.16039247810840607, "kl": 2.114430546760559, "learning_rate": 3.3621969322117765e-06, "loss": 0.06777290999889374, "num_tokens": 7944982.0, "reward": -0.1249999962747097, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": -0.1249999962747097, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 888, "step_time": 52.79890574550154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 21.0625, "completions/mean_terminated_length": 21.0625, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.396260017809439, "frac_reward_zero_std": 1.0, "grad_norm": 0.21837559342384338, "kl": 1.9849668145179749, "learning_rate": 3.357248886689758e-06, "loss": 0.07559309899806976, "num_tokens": 7956239.0, "reward": 0.424999987706542, "reward_std": 0.13363061845302582, "rewards/reward_financial_reasoning/mean": 0.424999987706542, "rewards/reward_financial_reasoning/std": 0.13363061845302582, "step": 890, "step_time": 25.786644731997512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 18.8125, "completions/mean_terminated_length": 18.8125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.39715048975957257, "frac_reward_zero_std": 0.5, "grad_norm": 11.93359088897705, "kl": 2.411112889647484, "learning_rate": 3.352300841167739e-06, "loss": 0.003915680572390556, "num_tokens": 7974396.0, "reward": 0.13750000670552254, "reward_std": 0.26692694425582886, "rewards/reward_financial_reasoning/mean": 0.13750000670552254, "rewards/reward_financial_reasoning/std": 0.26692697405815125, "step": 892, "step_time": 17.294510296509543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 11.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.3125, "completions/mean_terminated_length": 11.3125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.39804096170970615, "frac_reward_zero_std": 0.75, "grad_norm": 7.600701332092285, "kl": 2.586108446121216, "learning_rate": 3.34735279564572e-06, "loss": 0.07793588191270828, "num_tokens": 7987977.0, "reward": 0.1375000085681677, "reward_std": 0.28327932208776474, "rewards/reward_financial_reasoning/mean": 0.1375000085681677, "rewards/reward_financial_reasoning/std": 0.2832793518900871, "step": 894, "step_time": 11.401168355005211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.39893143365983974, "frac_reward_zero_std": 1.0, "grad_norm": 0.44599446654319763, "kl": 2.8776747286319733, "learning_rate": 3.342404750123701e-06, "loss": 0.11476292461156845, "num_tokens": 8004293.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 896, "step_time": 17.037187163503404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.375, "completions/clipped_ratio": 0.0625, "completions/max_length": 141.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 31.375, "completions/mean_terminated_length": 16.321428775787354, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.39982190560997327, "frac_reward_zero_std": 1.0, "grad_norm": 0.2007230818271637, "kl": 3.3882580399513245, "learning_rate": 3.3374567046016825e-06, "loss": 0.11634482443332672, "num_tokens": 8022299.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 898, "step_time": 50.946265338003286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.875, "completions/clipped_ratio": 0.0, "completions/max_length": 57.5, "completions/max_terminated_length": 57.5, "completions/mean_length": 21.875, "completions/mean_terminated_length": 21.875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.40071237756010686, "frac_reward_zero_std": 0.75, "grad_norm": 2.8853933811187744, "kl": 2.4549517929553986, "learning_rate": 3.332508659079664e-06, "loss": 0.1362210363149643, "num_tokens": 8040041.0, "reward": 0.07499999925494194, "reward_std": 0.0707106813788414, "rewards/reward_financial_reasoning/mean": 0.07499999925494194, "rewards/reward_financial_reasoning/std": 0.0707106813788414, "step": 900, "step_time": 25.934856780499103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.0625, "completions/clipped_ratio": 0.0625, "completions/max_length": 143.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 27.0625, "completions/mean_terminated_length": 11.500000238418579, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.40160284951024044, "frac_reward_zero_std": 0.75, "grad_norm": 9.213332176208496, "kl": 5.65405336022377, "learning_rate": 3.327560613557645e-06, "loss": 0.2474951297044754, "num_tokens": 8067754.0, "reward": -0.17500000074505806, "reward_std": 0.39675553888082504, "rewards/reward_financial_reasoning/mean": -0.17500000074505806, "rewards/reward_financial_reasoning/std": 0.39675553888082504, "step": 902, "step_time": 58.80656282400014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.8125, "completions/clipped_ratio": 0.0625, "completions/max_length": 136.5, "completions/max_terminated_length": 19.5, "completions/mean_length": 30.8125, "completions/mean_terminated_length": 15.892857551574707, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.402493321460374, "frac_reward_zero_std": 0.75, "grad_norm": 4.881185054779053, "kl": 2.6317369118332863, "learning_rate": 3.3226125680356262e-06, "loss": 0.18388396501541138, "num_tokens": 8085895.0, "reward": 0.1375000085681677, "reward_std": 0.28327932208776474, "rewards/reward_financial_reasoning/mean": 0.1375000085681677, "rewards/reward_financial_reasoning/std": 0.2832793518900871, "step": 904, "step_time": 49.99714060999759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 21.4375, "completions/clipped_ratio": 0.0, "completions/max_length": 28.5, "completions/max_terminated_length": 28.5, "completions/mean_length": 21.4375, "completions/mean_terminated_length": 21.4375, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.40338379341050756, "frac_reward_zero_std": 0.75, "grad_norm": 0.09603514522314072, "kl": 3.4366951882839203, "learning_rate": 3.317664522513607e-06, "loss": 0.14981061220169067, "num_tokens": 8108566.0, "reward": 0.11250000074505806, "reward_std": 0.27998724579811096, "rewards/reward_financial_reasoning/mean": 0.11250000074505806, "rewards/reward_financial_reasoning/std": 0.27998724579811096, "step": 906, "step_time": 21.12849767700027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.75, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 17.75, "completions/mean_terminated_length": 17.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.40427426536064115, "frac_reward_zero_std": 1.0, "grad_norm": 0.22057682275772095, "kl": 1.7570404410362244, "learning_rate": 3.3127164769915886e-06, "loss": 0.06977805495262146, "num_tokens": 8123674.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 908, "step_time": 15.24695787949895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.40516473731077474, "frac_reward_zero_std": 1.0, "grad_norm": 1.6793663501739502, "kl": 2.9984846711158752, "learning_rate": 3.3077684314695695e-06, "loss": 0.10924842953681946, "num_tokens": 8143346.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 910, "step_time": 17.416411400998186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 34.9375, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 34.9375, "completions/mean_terminated_length": 34.9375, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.40605520926090827, "frac_reward_zero_std": 0.75, "grad_norm": 0.6713170409202576, "kl": 2.7019808292388916, "learning_rate": 3.302820385947551e-06, "loss": 0.15788856148719788, "num_tokens": 8167513.0, "reward": 0.23750000912696123, "reward_std": 0.2931488901376724, "rewards/reward_financial_reasoning/mean": 0.23750000912696123, "rewards/reward_financial_reasoning/std": 0.2931489050388336, "step": 912, "step_time": 50.308363749998534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 26.375, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 26.375, "completions/mean_terminated_length": 26.375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.40694568121104185, "frac_reward_zero_std": 0.75, "grad_norm": 3.441307544708252, "kl": 3.6447712928056717, "learning_rate": 3.297872340425532e-06, "loss": 0.23996052145957947, "num_tokens": 8187695.0, "reward": 0.19999999552965164, "reward_std": 0.41815026104450226, "rewards/reward_financial_reasoning/mean": 0.19999999552965164, "rewards/reward_financial_reasoning/std": 0.41815026849508286, "step": 914, "step_time": 29.07546422100131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.375, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 15.375, "completions/mean_terminated_length": 15.375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.40783615316117544, "frac_reward_zero_std": 1.0, "grad_norm": 0.24823780357837677, "kl": 4.964463084936142, "learning_rate": 3.292924294903513e-06, "loss": 0.17128558456897736, "num_tokens": 8200701.0, "reward": 0.20000000670552254, "reward_std": 0.26726123690605164, "rewards/reward_financial_reasoning/mean": 0.20000000670552254, "rewards/reward_financial_reasoning/std": 0.26726123690605164, "step": 916, "step_time": 19.823334593500476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.5, "completions/clipped_ratio": 0.0625, "completions/max_length": 138.5, "completions/max_terminated_length": 19.0, "completions/mean_length": 31.5, "completions/mean_terminated_length": 16.383928775787354, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.40872662511130897, "frac_reward_zero_std": 1.0, "grad_norm": 0.31193023920059204, "kl": 2.5705791860818863, "learning_rate": 3.287976249381495e-06, "loss": 0.1012052446603775, "num_tokens": 8218805.0, "reward": 0.17500000074505806, "reward_std": 0.29398736357688904, "rewards/reward_financial_reasoning/mean": 0.17500000074505806, "rewards/reward_financial_reasoning/std": 0.29398736357688904, "step": 918, "step_time": 50.4936487595005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.5, "completions/clipped_ratio": 0.0, "completions/max_length": 27.5, "completions/max_terminated_length": 27.5, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 19.5, "completions/min_terminated_length": 19.5, "epoch": 0.40961709706144256, "frac_reward_zero_std": 1.0, "grad_norm": 0.12017732858657837, "kl": 2.4508478343486786, "learning_rate": 3.283028203859476e-06, "loss": 0.09826779365539551, "num_tokens": 8239765.0, "reward": -3.725290298461914e-09, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 920, "step_time": 19.770568174495565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 37.5, "completions/max_terminated_length": 37.5, "completions/mean_length": 17.8125, "completions/mean_terminated_length": 17.8125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.41050756901157615, "frac_reward_zero_std": 0.75, "grad_norm": 147.41748046875, "kl": 5.577771902084351, "learning_rate": 3.2780801583374573e-06, "loss": 0.3650239408016205, "num_tokens": 8259626.0, "reward": 0.4749999865889549, "reward_std": 0.12416292726993561, "rewards/reward_financial_reasoning/mean": 0.4749999865889549, "rewards/reward_financial_reasoning/std": 0.12416292726993561, "step": 922, "step_time": 22.212593034008023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 15.125, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 15.125, "completions/mean_terminated_length": 15.125, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.41139804096170973, "frac_reward_zero_std": 1.0, "grad_norm": 0.19488407671451569, "kl": 2.578184872865677, "learning_rate": 3.2731321128154383e-06, "loss": 0.10070281475782394, "num_tokens": 8276492.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 924, "step_time": 14.635534849996475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 35.8125, "completions/clipped_ratio": 0.0625, "completions/max_length": 141.0, "completions/max_terminated_length": 23.5, "completions/mean_length": 35.8125, "completions/mean_terminated_length": 20.964285850524902, "completions/min_length": 18.5, "completions/min_terminated_length": 18.5, "epoch": 0.41228851291184326, "frac_reward_zero_std": 1.0, "grad_norm": 0.15457630157470703, "kl": 2.4652615785598755, "learning_rate": 3.2681840672934196e-06, "loss": 0.09598159044981003, "num_tokens": 8288481.0, "reward": 0.20000000670552254, "reward_std": 0.26726123690605164, "rewards/reward_financial_reasoning/mean": 0.20000000670552254, "rewards/reward_financial_reasoning/std": 0.26726123690605164, "step": 926, "step_time": 45.35208458950365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 30.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 61.5, "completions/max_terminated_length": 61.5, "completions/mean_length": 30.6875, "completions/mean_terminated_length": 30.6875, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.41317898486197685, "frac_reward_zero_std": 0.75, "grad_norm": 0.5829043388366699, "kl": 2.8620926439762115, "learning_rate": 3.2632360217714006e-06, "loss": 0.2158636450767517, "num_tokens": 8306204.0, "reward": -0.10000000335276127, "reward_std": 0.21905138343572617, "rewards/reward_financial_reasoning/mean": -0.10000000335276127, "rewards/reward_financial_reasoning/std": 0.21905139833688736, "step": 928, "step_time": 27.723770248005167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 16.8125, "completions/mean_terminated_length": 16.8125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.41406945681211044, "frac_reward_zero_std": 0.75, "grad_norm": 0.07053980231285095, "kl": 3.5780192017555237, "learning_rate": 3.258287976249382e-06, "loss": 0.20760619640350342, "num_tokens": 8319833.0, "reward": 0.17500001192092896, "reward_std": 0.472439780831337, "rewards/reward_financial_reasoning/mean": 0.17500001192092896, "rewards/reward_financial_reasoning/std": 0.47243979573249817, "step": 930, "step_time": 15.179649859499477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.3125, "completions/clipped_ratio": 0.125, "completions/max_length": 133.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 41.3125, "completions/mean_terminated_length": 11.229166984558105, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.41495992876224397, "frac_reward_zero_std": 0.75, "grad_norm": 0.9923263788223267, "kl": 4.27918504178524, "learning_rate": 3.253339930727363e-06, "loss": 0.2304915338754654, "num_tokens": 8340382.0, "reward": 0.46249998826533556, "reward_std": 0.159518264234066, "rewards/reward_financial_reasoning/mean": 0.46249998826533556, "rewards/reward_financial_reasoning/std": 0.159518264234066, "step": 932, "step_time": 50.51667185199767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.75, "completions/clipped_ratio": 0.0, "completions/max_length": 34.5, "completions/max_terminated_length": 34.5, "completions/mean_length": 19.75, "completions/mean_terminated_length": 19.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.41585040071237755, "frac_reward_zero_std": 1.0, "grad_norm": 0.15250875055789948, "kl": 2.793649807572365, "learning_rate": 3.2483918852053443e-06, "loss": 0.09913206100463867, "num_tokens": 8359162.0, "reward": 0.1250000074505806, "reward_std": 0.34743961691856384, "rewards/reward_financial_reasoning/mean": 0.1250000074505806, "rewards/reward_financial_reasoning/std": 0.34743963181972504, "step": 934, "step_time": 20.64623363999999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 51.3125, "completions/clipped_ratio": 0.125, "completions/max_length": 140.5, "completions/max_terminated_length": 34.5, "completions/mean_length": 51.3125, "completions/mean_terminated_length": 21.75, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.41674087266251114, "frac_reward_zero_std": 1.0, "grad_norm": 0.18754097819328308, "kl": 1.8101801723241806, "learning_rate": 3.2434438396833252e-06, "loss": 0.05962216854095459, "num_tokens": 8383911.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 936, "step_time": 55.66111214999546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 33.875, "completions/clipped_ratio": 0.0625, "completions/max_length": 155.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.875, "completions/mean_terminated_length": 18.48214292526245, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.41763134461264473, "frac_reward_zero_std": 0.75, "grad_norm": 4.2415056228637695, "kl": 2.8781251162290573, "learning_rate": 3.2384957941613066e-06, "loss": 0.0792328417301178, "num_tokens": 8398349.0, "reward": 0.4499999862164259, "reward_std": 0.14603425562381744, "rewards/reward_financial_reasoning/mean": 0.4499999862164259, "rewards/reward_financial_reasoning/std": 0.14603426307439804, "step": 938, "step_time": 51.50570740849798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.25, "completions/clipped_ratio": 0.0, "completions/max_length": 26.5, "completions/max_terminated_length": 26.5, "completions/mean_length": 20.25, "completions/mean_terminated_length": 20.25, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.41852181656277826, "frac_reward_zero_std": 1.0, "grad_norm": 0.08914017677307129, "kl": 5.042290985584259, "learning_rate": 3.233547748639288e-06, "loss": 0.17624709010124207, "num_tokens": 8416193.0, "reward": -0.15000000596046448, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -0.15000000596046448, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 940, "step_time": 17.541959170001064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 55.4375, "completions/clipped_ratio": 0.0625, "completions/max_length": 138.0, "completions/max_terminated_length": 132.5, "completions/mean_length": 55.4375, "completions/mean_terminated_length": 43.892860412597656, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.41941228851291185, "frac_reward_zero_std": 0.75, "grad_norm": 2.0707364082336426, "kl": 1.8768098652362823, "learning_rate": 3.228599703117269e-06, "loss": 0.08203748613595963, "num_tokens": 8438224.0, "reward": -0.01249999925494194, "reward_std": 0.18850919604301453, "rewards/reward_financial_reasoning/mean": -0.01249999925494194, "rewards/reward_financial_reasoning/std": 0.18850919604301453, "step": 942, "step_time": 52.522713671496604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.4375, "completions/clipped_ratio": 0.125, "completions/max_length": 138.0, "completions/max_terminated_length": 20.5, "completions/mean_length": 46.4375, "completions/mean_terminated_length": 16.6875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.42030276046304543, "frac_reward_zero_std": 0.75, "grad_norm": 3.3209104537963867, "kl": 2.6846156250685453, "learning_rate": 3.2236516575952503e-06, "loss": 0.13027328252792358, "num_tokens": 8459887.0, "reward": 0.32500001043081284, "reward_std": 0.38347896933555603, "rewards/reward_financial_reasoning/mean": 0.32500001043081284, "rewards/reward_financial_reasoning/std": 0.3834789991378784, "step": 944, "step_time": 52.64768808199733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 16.6875, "completions/mean_terminated_length": 16.6875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.42119323241317896, "frac_reward_zero_std": 1.0, "grad_norm": 0.1603250652551651, "kl": 2.955336630344391, "learning_rate": 3.2187036120732313e-06, "loss": 0.1090928167104721, "num_tokens": 8482226.0, "reward": 0.1250000074505806, "reward_std": 0.34743961691856384, "rewards/reward_financial_reasoning/mean": 0.1250000074505806, "rewards/reward_financial_reasoning/std": 0.34743963181972504, "step": 946, "step_time": 19.271809785506775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.6875, "completions/clipped_ratio": 0.0, "completions/max_length": 29.5, "completions/max_terminated_length": 29.5, "completions/mean_length": 18.6875, "completions/mean_terminated_length": 18.6875, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.42208370436331255, "frac_reward_zero_std": 1.0, "grad_norm": 0.6349166631698608, "kl": 2.067967712879181, "learning_rate": 3.2137555665512126e-06, "loss": 0.07463287562131882, "num_tokens": 8501501.0, "reward": 0.30000001192092896, "reward_std": 0.5345224589109421, "rewards/reward_financial_reasoning/mean": 0.30000001192092896, "rewards/reward_financial_reasoning/std": 0.5345224738121033, "step": 948, "step_time": 19.316131935000158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.125, "completions/clipped_ratio": 0.0, "completions/max_length": 39.5, "completions/max_terminated_length": 39.5, "completions/mean_length": 23.125, "completions/mean_terminated_length": 23.125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.42297417631344614, "frac_reward_zero_std": 0.75, "grad_norm": 0.08936484903097153, "kl": 2.352201282978058, "learning_rate": 3.2088075210291936e-06, "loss": 0.05001607537269592, "num_tokens": 8516351.0, "reward": 0.5125000029802322, "reward_std": 0.3090885281562805, "rewards/reward_financial_reasoning/mean": 0.5125000029802322, "rewards/reward_financial_reasoning/std": 0.3090885281562805, "step": 950, "step_time": 19.179083563998574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 69.5, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 69.5, "completions/mean_terminated_length": 7.428571701049805, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.4238646482635797, "frac_reward_zero_std": 0.5, "grad_norm": 0.2313220351934433, "kl": 4.225497528910637, "learning_rate": 3.203859475507175e-06, "loss": 0.42357900738716125, "num_tokens": 8535511.0, "reward": 0.42499999701976776, "reward_std": 0.5846030414104462, "rewards/reward_financial_reasoning/mean": 0.42499999701976776, "rewards/reward_financial_reasoning/std": 0.5846030116081238, "step": 952, "step_time": 83.1467112555074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 17.5, "completions/clipped_ratio": 0.0, "completions/max_length": 31.5, "completions/max_terminated_length": 31.5, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.42475512021371326, "frac_reward_zero_std": 1.0, "grad_norm": 0.17937541007995605, "kl": 3.103740006685257, "learning_rate": 3.198911429985156e-06, "loss": 0.1278012990951538, "num_tokens": 8545871.0, "reward": 0.2750000059604645, "reward_std": 0.5612486004829407, "rewards/reward_financial_reasoning/mean": 0.2750000059604645, "rewards/reward_financial_reasoning/std": 0.5612486004829407, "step": 954, "step_time": 14.637012897994282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 6.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.5, "completions/max_terminated_length": 9.5, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.42564559216384684, "frac_reward_zero_std": 1.0, "grad_norm": 0.2365179806947708, "kl": 6.207745045423508, "learning_rate": 3.1939633844631373e-06, "loss": 0.22508688271045685, "num_tokens": 8560383.0, "reward": 0.5249999910593033, "reward_std": 0.40089183300733566, "rewards/reward_financial_reasoning/mean": 0.5249999910593033, "rewards/reward_financial_reasoning/std": 0.40089183300733566, "step": 956, "step_time": 10.830135092503042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 41.5625, "completions/clipped_ratio": 0.0625, "completions/max_length": 138.5, "completions/max_terminated_length": 116.5, "completions/mean_length": 41.5625, "completions/mean_terminated_length": 28.312501907348633, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.42653606411398043, "frac_reward_zero_std": 0.75, "grad_norm": 0.7334434390068054, "kl": 2.0300208553671837, "learning_rate": 3.1890153389411182e-06, "loss": 0.20082223415374756, "num_tokens": 8573904.0, "reward": 0.0, "reward_std": 0.35675284266471863, "rewards/reward_financial_reasoning/mean": 0.0, "rewards/reward_financial_reasoning/std": 0.3567528575658798, "step": 958, "step_time": 47.42274022550191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 20.5, "completions/clipped_ratio": 0.0, "completions/max_length": 32.5, "completions/max_terminated_length": 32.5, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.42742653606411396, "frac_reward_zero_std": 1.0, "grad_norm": 0.19361354410648346, "kl": 2.600453108549118, "learning_rate": 3.1840672934190996e-06, "loss": 0.09075171500444412, "num_tokens": 8595496.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 960, "step_time": 21.33384811499127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 16.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 45.5, "completions/max_terminated_length": 45.5, "completions/mean_length": 16.8125, "completions/mean_terminated_length": 16.8125, "completions/min_length": 11.5, "completions/min_terminated_length": 11.5, "epoch": 0.42831700801424755, "frac_reward_zero_std": 1.0, "grad_norm": 0.20128197968006134, "kl": 2.412764996290207, "learning_rate": 3.179119247897081e-06, "loss": 0.0916946604847908, "num_tokens": 8610477.0, "reward": 0.02500000037252903, "reward_std": 0.08017837256193161, "rewards/reward_financial_reasoning/mean": 0.02500000037252903, "rewards/reward_financial_reasoning/std": 0.08017837256193161, "step": 962, "step_time": 20.371279284001503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 40.375, "completions/clipped_ratio": 0.125, "completions/max_length": 138.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 40.375, "completions/mean_terminated_length": 9.270833492279053, "completions/min_length": 5.5, "completions/min_terminated_length": 5.5, "epoch": 0.42920747996438113, "frac_reward_zero_std": 1.0, "grad_norm": 0.11826738715171814, "kl": 3.3262559920549393, "learning_rate": 3.174171202375062e-06, "loss": 0.11317823082208633, "num_tokens": 8630323.0, "reward": 0.025000005960464478, "reward_std": 0.24053511023521423, "rewards/reward_financial_reasoning/mean": 0.025000005960464478, "rewards/reward_financial_reasoning/std": 0.24053512513637543, "step": 964, "step_time": 49.96986727849435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 36.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 135.5, "completions/max_terminated_length": 135.5, "completions/mean_length": 36.8125, "completions/mean_terminated_length": 36.8125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.4300979519145147, "frac_reward_zero_std": 0.5, "grad_norm": 5.354856967926025, "kl": 2.3516476303339005, "learning_rate": 3.1692231568530433e-06, "loss": 0.2586827874183655, "num_tokens": 8647824.0, "reward": 0.1875000074505806, "reward_std": 0.27381163090467453, "rewards/reward_financial_reasoning/mean": 0.1875000074505806, "rewards/reward_financial_reasoning/std": 0.2738116607069969, "step": 966, "step_time": 46.45492545999514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 27.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 29.5, "completions/max_terminated_length": 29.5, "completions/mean_length": 27.3125, "completions/mean_terminated_length": 27.3125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.43098842386464825, "frac_reward_zero_std": 1.0, "grad_norm": 0.06150537729263306, "kl": 1.8199052214622498, "learning_rate": 3.1642751113310242e-06, "loss": 0.07260263711214066, "num_tokens": 8667645.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/reward_financial_reasoning/mean": 0.10000000149011612, "rewards/reward_financial_reasoning/std": 0.0, "step": 968, "step_time": 19.451285248505883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 76.125, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 124.5, "completions/mean_length": 76.125, "completions/mean_terminated_length": 35.178571701049805, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.43187889581478184, "frac_reward_zero_std": 0.75, "grad_norm": 3.2838754653930664, "kl": 1.8590649515390396, "learning_rate": 3.159327065809006e-06, "loss": 0.2001737803220749, "num_tokens": 8683503.0, "reward": 0.32500001043081284, "reward_std": 0.38347896933555603, "rewards/reward_financial_reasoning/mean": 0.32500001043081284, "rewards/reward_financial_reasoning/std": 0.3834789991378784, "step": 970, "step_time": 78.57448822999868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 45.5625, "completions/clipped_ratio": 0.125, "completions/max_length": 154.5, "completions/max_terminated_length": 32.0, "completions/mean_length": 45.5625, "completions/mean_terminated_length": 14.895833492279053, "completions/min_length": 7.5, "completions/min_terminated_length": 7.5, "epoch": 0.4327693677649154, "frac_reward_zero_std": 0.75, "grad_norm": 0.10290610790252686, "kl": 3.8507900685071945, "learning_rate": 3.1543790202869866e-06, "loss": 0.2706332206726074, "num_tokens": 8692640.0, "reward": 0.25000000558793545, "reward_std": 0.31163340061903, "rewards/reward_financial_reasoning/mean": 0.25000000558793545, "rewards/reward_financial_reasoning/std": 0.31163340061903, "step": 972, "step_time": 46.23696361999828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 19.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 25.5, "completions/max_terminated_length": 25.5, "completions/mean_length": 19.5625, "completions/mean_terminated_length": 19.5625, "completions/min_length": 13.5, "completions/min_terminated_length": 13.5, "epoch": 0.43365983971504896, "frac_reward_zero_std": 1.0, "grad_norm": 0.1872958242893219, "kl": 2.4179421216249466, "learning_rate": 3.1494309747649684e-06, "loss": 0.09821489453315735, "num_tokens": 8713617.0, "reward": 0.2750000096857548, "reward_std": 0.18708287179470062, "rewards/reward_financial_reasoning/mean": 0.2750000096857548, "rewards/reward_financial_reasoning/std": 0.18708288669586182, "step": 974, "step_time": 18.739151772500918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 12.0625, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 12.0625, "completions/mean_terminated_length": 12.0625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.43455031166518254, "frac_reward_zero_std": 1.0, "grad_norm": 1.4092589616775513, "kl": 3.115272670984268, "learning_rate": 3.144482929242949e-06, "loss": 0.1181776374578476, "num_tokens": 8734858.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 976, "step_time": 18.696438336995925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 18.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 22.5, "completions/max_terminated_length": 22.5, "completions/mean_length": 18.8125, "completions/mean_terminated_length": 18.8125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.43544078361531613, "frac_reward_zero_std": 1.0, "grad_norm": 0.09314737468957901, "kl": 1.7916912287473679, "learning_rate": 3.1395348837209307e-06, "loss": 0.06952106952667236, "num_tokens": 8750415.0, "reward": 0.1250000074505806, "reward_std": 0.34743961691856384, "rewards/reward_financial_reasoning/mean": 0.1250000074505806, "rewards/reward_financial_reasoning/std": 0.34743963181972504, "step": 978, "step_time": 14.855361223002546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 22.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.5, "completions/max_terminated_length": 49.5, "completions/mean_length": 22.0, "completions/mean_terminated_length": 22.0, "completions/min_length": 10.5, "completions/min_terminated_length": 10.5, "epoch": 0.4363312555654497, "frac_reward_zero_std": 0.75, "grad_norm": 0.07393768429756165, "kl": 3.0034860372543335, "learning_rate": 3.134586838198912e-06, "loss": 0.12021099776029587, "num_tokens": 8767855.0, "reward": 0.46250002086162567, "reward_std": 0.39018382132053375, "rewards/reward_financial_reasoning/mean": 0.46250002086162567, "rewards/reward_financial_reasoning/std": 0.39018386602401733, "step": 980, "step_time": 23.458823763998225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.8125, "completions/clipped_ratio": 0.0, "completions/max_length": 24.5, "completions/max_terminated_length": 24.5, "completions/mean_length": 14.8125, "completions/mean_terminated_length": 14.8125, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.43722172751558325, "frac_reward_zero_std": 1.0, "grad_norm": 0.14612756669521332, "kl": 3.3168026506900787, "learning_rate": 3.129638792676893e-06, "loss": 0.13220839202404022, "num_tokens": 8787948.0, "reward": -0.07499999925494194, "reward_std": 0.026726126670837402, "rewards/reward_financial_reasoning/mean": -0.07499999925494194, "rewards/reward_financial_reasoning/std": 0.026726126670837402, "step": 982, "step_time": 18.28132488799747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 47.125, "completions/clipped_ratio": 0.0, "completions/max_length": 138.5, "completions/max_terminated_length": 138.5, "completions/mean_length": 47.125, "completions/mean_terminated_length": 47.125, "completions/min_length": 9.5, "completions/min_terminated_length": 9.5, "epoch": 0.43811219946571683, "frac_reward_zero_std": 1.0, "grad_norm": 0.17604397237300873, "kl": 11.56613752245903, "learning_rate": 3.1246907471548744e-06, "loss": 0.22730720043182373, "num_tokens": 8807486.0, "reward": 0.2000000085681677, "reward_std": 0.26726124435663223, "rewards/reward_financial_reasoning/mean": 0.2000000085681677, "rewards/reward_financial_reasoning/std": 0.2672612592577934, "step": 984, "step_time": 49.57627448099811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 28.5625, "completions/clipped_ratio": 0.0, "completions/max_length": 89.5, "completions/max_terminated_length": 89.5, "completions/mean_length": 28.5625, "completions/mean_terminated_length": 28.5625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.4390026714158504, "frac_reward_zero_std": 0.75, "grad_norm": 2.47641658782959, "kl": 2.2521740794181824, "learning_rate": 3.1197427016328553e-06, "loss": 0.14822399616241455, "num_tokens": 8828775.0, "reward": 0.07499999925494194, "reward_std": 0.0707106813788414, "rewards/reward_financial_reasoning/mean": 0.07499999925494194, "rewards/reward_financial_reasoning/std": 0.0707106813788414, "step": 986, "step_time": 37.008812667496386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 14.3125, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 14.3125, "completions/mean_terminated_length": 14.3125, "completions/min_length": 6.5, "completions/min_terminated_length": 6.5, "epoch": 0.43989314336598395, "frac_reward_zero_std": 0.75, "grad_norm": 0.25839534401893616, "kl": 6.1503177136182785, "learning_rate": 3.1147946561108367e-06, "loss": 0.19753281772136688, "num_tokens": 8850516.0, "reward": 0.5750000178813934, "reward_std": 0.49124836921691895, "rewards/reward_financial_reasoning/mean": 0.5750000178813934, "rewards/reward_financial_reasoning/std": 0.4912484139204025, "step": 988, "step_time": 18.673066382496472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 13.625, "completions/clipped_ratio": 0.0, "completions/max_length": 20.5, "completions/max_terminated_length": 20.5, "completions/mean_length": 13.625, "completions/mean_terminated_length": 13.625, "completions/min_length": 8.5, "completions/min_terminated_length": 8.5, "epoch": 0.44078361531611754, "frac_reward_zero_std": 1.0, "grad_norm": 0.7419772148132324, "kl": 3.7723660320043564, "learning_rate": 3.1098466105888177e-06, "loss": 0.12690135836601257, "num_tokens": 8871710.0, "reward": 0.32500000670552254, "reward_std": 0.24053511023521423, "rewards/reward_financial_reasoning/mean": 0.32500000670552254, "rewards/reward_financial_reasoning/std": 0.24053512513637543, "step": 990, "step_time": 17.83083552050084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 23.25, "completions/clipped_ratio": 0.0, "completions/max_length": 34.5, "completions/max_terminated_length": 34.5, "completions/mean_length": 23.25, "completions/mean_terminated_length": 23.25, "completions/min_length": 12.5, "completions/min_terminated_length": 12.5, "epoch": 0.4416740872662511, "frac_reward_zero_std": 1.0, "grad_norm": 1.1373910903930664, "kl": 2.4735867977142334, "learning_rate": 3.104898565066799e-06, "loss": 0.09462883323431015, "num_tokens": 8890818.0, "reward": -0.05000000074505806, "reward_std": 0.16035674512386322, "rewards/reward_financial_reasoning/mean": -0.05000000074505806, "rewards/reward_financial_reasoning/std": 0.16035674512386322, "step": 992, "step_time": 20.594509614002163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 37.5, "completions/clipped_ratio": 0.0, "completions/max_length": 102.5, "completions/max_terminated_length": 102.5, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 17.5, "completions/min_terminated_length": 17.5, "epoch": 0.44256455921638466, "frac_reward_zero_std": 1.0, "grad_norm": 0.07724303007125854, "kl": 2.4766226708889008, "learning_rate": 3.09995051954478e-06, "loss": 0.09341251105070114, "num_tokens": 8907066.0, "reward": -3.725290298461914e-09, "reward_std": 0.10690449923276901, "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, "rewards/reward_financial_reasoning/std": 0.10690450668334961, "step": 994, "step_time": 36.1086978409985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 46.5625, "completions/clipped_ratio": 0.125, "completions/max_length": 142.5, "completions/max_terminated_length": 24.0, "completions/mean_length": 46.5625, "completions/mean_terminated_length": 15.666666984558105, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.44345503116651824, "frac_reward_zero_std": 0.75, "grad_norm": 0.3874448537826538, "kl": 2.218121826648712, "learning_rate": 3.0950024740227614e-06, "loss": 0.17174197733402252, "num_tokens": 8922219.0, "reward": 0.32500000670552254, "reward_std": 0.2314550280570984, "rewards/reward_financial_reasoning/mean": 0.32500000670552254, "rewards/reward_financial_reasoning/std": 0.2314550280570984, "step": 996, "step_time": 45.69451289299468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 31.3125, "completions/clipped_ratio": 0.0625, "completions/max_length": 136.5, "completions/max_terminated_length": 28.5, "completions/mean_length": 31.3125, "completions/mean_terminated_length": 16.54464340209961, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.44434550311665183, "frac_reward_zero_std": 0.75, "grad_norm": 0.13446766138076782, "kl": 3.6593779623508453, "learning_rate": 3.0900544285007423e-06, "loss": 0.1636892408132553, "num_tokens": 8938832.0, "reward": 0.22500000894069672, "reward_std": 0.434930756688118, "rewards/reward_financial_reasoning/mean": 0.22500000894069672, "rewards/reward_financial_reasoning/std": 0.4349307715892792, "step": 998, "step_time": 47.712723782995454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 56.625, "completions/clipped_ratio": 0.125, "completions/max_length": 150.0, "completions/max_terminated_length": 50.5, "completions/mean_length": 56.625, "completions/mean_terminated_length": 26.70833396911621, "completions/min_length": 15.5, "completions/min_terminated_length": 15.5, "epoch": 0.4452359750667854, "frac_reward_zero_std": 0.75, "grad_norm": 2.231067657470703, "kl": 1.4210374727845192, "learning_rate": 3.0851063829787237e-06, "loss": 0.08275322616100311, "num_tokens": 8960930.0, "reward": 0.23750001192092896, "reward_std": 0.5119454711675644, "rewards/reward_financial_reasoning/mean": 0.23750001192092896, "rewards/reward_financial_reasoning/std": 0.5119454860687256, "step": 1000, "step_time": 54.88188034250561 } ], "logging_steps": 2, "max_steps": 2246, "num_input_tokens_seen": 8960930, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }