diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4452359750667854, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.5, + "completions/max_terminated_length": 17.5, + "completions/mean_length": 10.375, + "completions/mean_terminated_length": 10.375, + "completions/min_length": 3.5, + "completions/min_terminated_length": 3.5, + "epoch": 0.0008904719501335708, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.243957996368408, + "kl": 4.46301195025444, + "learning_rate": 2.2222222222222224e-08, + "loss": 0.08426375687122345, + "num_tokens": 18262.0, + "reward": 0.21250001154839993, + "reward_std": 0.37748774141073227, + "rewards/reward_financial_reasoning/mean": 0.21250001154839993, + "rewards/reward_financial_reasoning/std": 0.37748774141073227, + "step": 2, + "step_time": 69.13269069449962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.5, + "completions/max_terminated_length": 28.5, + "completions/mean_length": 19.8125, + "completions/mean_terminated_length": 19.8125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.0017809439002671415, + "frac_reward_zero_std": 0.5, + "grad_norm": 170.77232360839844, + "kl": 3.1398727893829346, + "learning_rate": 6.666666666666668e-08, + "loss": 0.12181135267019272, + "num_tokens": 31739.0, + "reward": 0.050000001676380634, + "reward_std": 0.2702740728855133, + "rewards/reward_financial_reasoning/mean": 0.050000001676380634, + "rewards/reward_financial_reasoning/std": 0.2702740877866745, + "step": 4, + "step_time": 15.650146482497803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.0026714158504007124, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4621564149856567, + "kl": 4.303169131278992, + "learning_rate": 1.1111111111111112e-07, + "loss": 0.1490020453929901, + "num_tokens": 48159.0, + "reward": -0.07500000484287739, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": -0.07500000484287739, + "rewards/reward_financial_reasoning/std": 0.18708287924528122, + "step": 6, + "step_time": 16.41489795200323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.5, + "completions/max_terminated_length": 9.5, + "completions/mean_length": 6.125, + "completions/mean_terminated_length": 6.125, + "completions/min_length": 3.5, + "completions/min_terminated_length": 3.5, + "epoch": 0.003561887800534283, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.4477624893188477, + "kl": 4.426726162433624, + "learning_rate": 1.5555555555555556e-07, + "loss": 0.1457902044057846, + "num_tokens": 62193.0, + "reward": 0.3125, + "reward_std": 0.6411882638931274, + "rewards/reward_financial_reasoning/mean": 0.3125, + "rewards/reward_financial_reasoning/std": 0.6411882936954498, + "step": 8, + "step_time": 10.300071641000613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.5, + "completions/max_terminated_length": 23.5, + "completions/mean_length": 14.8125, + "completions/mean_terminated_length": 14.8125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.004452359750667854, + "frac_reward_zero_std": 0.75, + "grad_norm": 18.143714904785156, + "kl": 8.868210554122925, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.10136213153600693, + "num_tokens": 79262.0, + "reward": -0.15000000037252903, + "reward_std": 0.19667484611272812, + "rewards/reward_financial_reasoning/mean": -0.15000000037252903, + "rewards/reward_financial_reasoning/std": 0.19667484611272812, + "step": 10, + "step_time": 15.87173518450254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.5, + "completions/max_terminated_length": 32.5, + "completions/mean_length": 16.3125, + "completions/mean_terminated_length": 16.3125, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.005342831700801425, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.6053011417388916, + "kl": 3.103500008583069, + "learning_rate": 2.444444444444445e-07, + "loss": 0.04430732876062393, + "num_tokens": 96275.0, + "reward": -0.049999999813735485, + "reward_std": 0.23070836067199707, + "rewards/reward_financial_reasoning/mean": -0.049999999813735485, + "rewards/reward_financial_reasoning/std": 0.23070836067199707, + "step": 12, + "step_time": 18.03825327550112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 84.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.006233303650934996, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.26595669984817505, + "kl": 4.701120108366013, + "learning_rate": 2.888888888888889e-07, + "loss": 0.08473779261112213, + "num_tokens": 105843.0, + "reward": 0.48749998956918716, + "reward_std": 0.2176603004336357, + "rewards/reward_financial_reasoning/mean": 0.48749998956918716, + "rewards/reward_financial_reasoning/std": 0.2176603153347969, + "step": 14, + "step_time": 26.971775608002645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 4.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7.0, + "completions/max_terminated_length": 7.0, + "completions/mean_length": 4.4375, + "completions/mean_terminated_length": 4.4375, + "completions/min_length": 1.5, + "completions/min_terminated_length": 1.5, + "epoch": 0.007123775601068566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7614744901657104, + "kl": 5.345696994656464, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.19742266833782196, + "num_tokens": 120834.0, + "reward": 0.2750000059604645, + "reward_std": 0.6681531071662903, + "rewards/reward_financial_reasoning/mean": 0.2750000059604645, + "rewards/reward_financial_reasoning/std": 0.6681531071662903, + "step": 16, + "step_time": 10.608633042498695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.5, + "completions/max_terminated_length": 21.5, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.008014247551202136, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.597021460533142, + "kl": 5.759613037109375, + "learning_rate": 3.777777777777778e-07, + "loss": 0.20153652131557465, + "num_tokens": 143318.0, + "reward": -0.375, + "reward_std": 0.13363061845302582, + "rewards/reward_financial_reasoning/mean": -0.375, + "rewards/reward_financial_reasoning/std": 0.13363061845302582, + "step": 18, + "step_time": 18.688278918496508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 44.3125, + "completions/clipped_ratio": 0.125, + "completions/max_length": 136.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 44.3125, + "completions/mean_terminated_length": 14.895833969116211, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.008904719501335707, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.555985450744629, + "kl": 6.408893913030624, + "learning_rate": 4.2222222222222226e-07, + "loss": 0.1307135373353958, + "num_tokens": 164947.0, + "reward": -0.12500000186264515, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": -0.12500000186264515, + "rewards/reward_financial_reasoning/std": 0.24053511768579483, + "step": 20, + "step_time": 48.735242737997396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 23.6875, + "completions/mean_terminated_length": 23.6875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.009795191451469279, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.6187081336975098, + "kl": 3.0680589228868484, + "learning_rate": 4.666666666666667e-07, + "loss": -0.0021132836118340492, + "num_tokens": 180622.0, + "reward": -0.10000000149011612, + "reward_std": 0.24984834343194962, + "rewards/reward_financial_reasoning/mean": -0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.2498483583331108, + "step": 22, + "step_time": 19.969555340499937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 11.375, + "completions/mean_terminated_length": 11.375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.01068566340160285, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.780721187591553, + "kl": 3.334374338388443, + "learning_rate": 5.111111111111112e-07, + "loss": -0.036053549498319626, + "num_tokens": 199364.0, + "reward": 0.01250000111758709, + "reward_std": 0.30308106541633606, + "rewards/reward_financial_reasoning/mean": 0.01250000111758709, + "rewards/reward_financial_reasoning/std": 0.30308108031749725, + "step": 24, + "step_time": 16.343573131500307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 21.375, + "completions/mean_terminated_length": 21.375, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.01157613535173642, + "frac_reward_zero_std": 0.75, + "grad_norm": 15.003988265991211, + "kl": 3.3079543113708496, + "learning_rate": 5.555555555555555e-07, + "loss": 0.024946369230747223, + "num_tokens": 217602.0, + "reward": 0.03749999962747097, + "reward_std": 0.11877349019050598, + "rewards/reward_financial_reasoning/mean": 0.03749999962747097, + "rewards/reward_financial_reasoning/std": 0.11877349019050598, + "step": 26, + "step_time": 23.70613613250316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.5, + "completions/max_terminated_length": 23.5, + "completions/mean_length": 15.375, + "completions/mean_terminated_length": 15.375, + "completions/min_length": 4.5, + "completions/min_terminated_length": 4.5, + "epoch": 0.012466607301869992, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.009876251220703, + "kl": 4.639135271310806, + "learning_rate": 6.000000000000001e-07, + "loss": 0.03613180294632912, + "num_tokens": 236040.0, + "reward": 0.25000000558793545, + "reward_std": 0.3794081211090088, + "rewards/reward_financial_reasoning/mean": 0.25000000558793545, + "rewards/reward_financial_reasoning/std": 0.37940813601017, + "step": 28, + "step_time": 16.66078308500437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 7.0, + "completions/mean_terminated_length": 7.0, + "completions/min_length": 1.5, + "completions/min_terminated_length": 1.5, + "epoch": 0.013357079252003561, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.2118546962738037, + "kl": 6.419469177722931, + "learning_rate": 6.444444444444445e-07, + "loss": 0.22115381062030792, + "num_tokens": 248568.0, + "reward": 0.025000005960464478, + "reward_std": 0.40089186280965805, + "rewards/reward_financial_reasoning/mean": 0.025000005960464478, + "rewards/reward_financial_reasoning/std": 0.40089186280965805, + "step": 30, + "step_time": 11.650034055499418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.014247551202137132, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1544721126556396, + "kl": 3.581442207098007, + "learning_rate": 6.88888888888889e-07, + "loss": 0.13401712477207184, + "num_tokens": 263136.0, + "reward": -0.07500000484287739, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": -0.07500000484287739, + "rewards/reward_financial_reasoning/std": 0.18708287924528122, + "step": 32, + "step_time": 13.175670107999395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.5, + "completions/max_terminated_length": 10.5, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.015138023152270703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14551043510437012, + "kl": 4.411951839923859, + "learning_rate": 7.333333333333334e-07, + "loss": 0.17104172706604004, + "num_tokens": 284920.0, + "reward": -0.2749999985098839, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": -0.2749999985098839, + "rewards/reward_financial_reasoning/std": 0.24053511768579483, + "step": 34, + "step_time": 15.403023695998854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 63.1875, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 140.5, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 63.1875, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.016028495102404273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.961294949054718, + "kl": 2.866768404841423, + "learning_rate": 7.777777777777779e-07, + "loss": 0.09989761561155319, + "num_tokens": 300419.0, + "reward": 0.12500000558793545, + "reward_std": 0.34743960946798325, + "rewards/reward_financial_reasoning/mean": 0.12500000558793545, + "rewards/reward_financial_reasoning/std": 0.34743960946798325, + "step": 36, + "step_time": 45.11628793899581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.016918967052537846, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.7937123775482178, + "kl": 3.6248140931129456, + "learning_rate": 8.222222222222223e-07, + "loss": -0.07162602245807648, + "num_tokens": 314263.0, + "reward": -0.05000000074505806, + "reward_std": 0.2777460217475891, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.2777460217475891, + "step": 38, + "step_time": 13.610250827498021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 152.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 25.125, + "completions/mean_terminated_length": 9.75, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.017809439002671415, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8069584369659424, + "kl": 4.78283154964447, + "learning_rate": 8.666666666666668e-07, + "loss": 0.15486478805541992, + "num_tokens": 336865.0, + "reward": -0.2749999985098839, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": -0.2749999985098839, + "rewards/reward_financial_reasoning/std": 0.24053511768579483, + "step": 40, + "step_time": 54.72188140000435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.5, + "completions/max_terminated_length": 32.5, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.018699910952804988, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.385245323181152, + "kl": 2.647599071264267, + "learning_rate": 9.111111111111113e-07, + "loss": 0.011303652077913284, + "num_tokens": 356229.0, + "reward": -0.03750000149011612, + "reward_std": 0.1989518627524376, + "rewards/reward_financial_reasoning/mean": -0.03750000149011612, + "rewards/reward_financial_reasoning/std": 0.1989518627524376, + "step": 42, + "step_time": 19.585943939000572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 21.1875, + "completions/mean_terminated_length": 21.1875, + "completions/min_length": 1.5, + "completions/min_terminated_length": 1.5, + "epoch": 0.019590382902938557, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2599256038665771, + "kl": 2.3717754259705544, + "learning_rate": 9.555555555555556e-07, + "loss": -0.02832583151757717, + "num_tokens": 371504.0, + "reward": -0.02500000037252903, + "reward_std": 0.3867208957672119, + "rewards/reward_financial_reasoning/mean": -0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.3867208957672119, + "step": 44, + "step_time": 26.471768608502316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 136.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 27.0, + "completions/mean_terminated_length": 11.785714626312256, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.020480854853072127, + "frac_reward_zero_std": 0.75, + "grad_norm": 14.519295692443848, + "kl": 3.7309726029634476, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.22456848621368408, + "num_tokens": 383456.0, + "reward": 0.5375000089406967, + "reward_std": 0.49749018251895905, + "rewards/reward_financial_reasoning/mean": 0.5375000089406967, + "rewards/reward_financial_reasoning/std": 0.49749018251895905, + "step": 46, + "step_time": 41.24796561349831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 54.1875, + "completions/clipped_ratio": 0.125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 130.5, + "completions/mean_length": 54.1875, + "completions/mean_terminated_length": 22.6875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0213713268032057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7381348013877869, + "kl": 4.320437252521515, + "learning_rate": 1.0444444444444445e-06, + "loss": 0.11993282288312912, + "num_tokens": 403243.0, + "reward": -0.1999999973922968, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.1999999973922968, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 48, + "step_time": 79.4801567964987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 13.625, + "completions/mean_terminated_length": 13.625, + "completions/min_length": 1.5, + "completions/min_terminated_length": 1.5, + "epoch": 0.02226179875333927, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1954340785741806, + "kl": 2.8072616159915924, + "learning_rate": 1.0888888888888889e-06, + "loss": 0.022382210940122604, + "num_tokens": 420445.0, + "reward": -0.22500000335276127, + "reward_std": 0.19232525676488876, + "rewards/reward_financial_reasoning/mean": -0.22500000335276127, + "rewards/reward_financial_reasoning/std": 0.19232525676488876, + "step": 50, + "step_time": 15.39180759699775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 19.0625, + "completions/mean_terminated_length": 19.0625, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.02315227070347284, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.9910659790039062, + "kl": 2.668681114912033, + "learning_rate": 1.1333333333333334e-06, + "loss": 0.11818550527095795, + "num_tokens": 442006.0, + "reward": -0.15000000223517418, + "reward_std": 0.3017780929803848, + "rewards/reward_financial_reasoning/mean": -0.15000000223517418, + "rewards/reward_financial_reasoning/std": 0.301778107881546, + "step": 52, + "step_time": 18.692679949499507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.5, + "completions/max_terminated_length": 13.5, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.02404274265360641, + "frac_reward_zero_std": 0.5, + "grad_norm": 5.372073173522949, + "kl": 3.6646086424589157, + "learning_rate": 1.1777777777777778e-06, + "loss": 0.02050051838159561, + "num_tokens": 459962.0, + "reward": -0.09999999683350325, + "reward_std": 0.3260497897863388, + "rewards/reward_financial_reasoning/mean": -0.09999999683350325, + "rewards/reward_financial_reasoning/std": 0.3260497897863388, + "step": 54, + "step_time": 13.365985435504626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.5, + "completions/max_terminated_length": 23.5, + "completions/mean_length": 18.3125, + "completions/mean_terminated_length": 18.3125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.024933214603739984, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.28867170214653015, + "kl": 2.8480603992938995, + "learning_rate": 1.2222222222222223e-06, + "loss": 0.015568019822239876, + "num_tokens": 479183.0, + "reward": -0.07500000484287739, + "reward_std": 0.24577751010656357, + "rewards/reward_financial_reasoning/mean": -0.07500000484287739, + "rewards/reward_financial_reasoning/std": 0.24577751755714417, + "step": 56, + "step_time": 16.855563032000646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.5, + "completions/max_terminated_length": 26.5, + "completions/mean_length": 15.0625, + "completions/mean_terminated_length": 15.0625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.025823686553873553, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.5727155208587646, + "kl": 4.796411827206612, + "learning_rate": 1.2666666666666669e-06, + "loss": 0.06868542730808258, + "num_tokens": 498696.0, + "reward": -0.20000000298023224, + "reward_std": 0.3437739461660385, + "rewards/reward_financial_reasoning/mean": -0.20000000298023224, + "rewards/reward_financial_reasoning/std": 0.3437739461660385, + "step": 58, + "step_time": 17.986308140998517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 8.0625, + "completions/mean_terminated_length": 8.0625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.026714158504007122, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5326482653617859, + "kl": 4.87631031870842, + "learning_rate": 1.3111111111111112e-06, + "loss": 0.15867890417575836, + "num_tokens": 520465.0, + "reward": -0.2749999985098839, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": -0.2749999985098839, + "rewards/reward_financial_reasoning/std": 0.24053511768579483, + "step": 60, + "step_time": 15.652293133502098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 13.125, + "completions/mean_terminated_length": 13.125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.027604630454140695, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.4932446479797363, + "kl": 3.3685058057308197, + "learning_rate": 1.3555555555555558e-06, + "loss": -0.01082983985543251, + "num_tokens": 541939.0, + "reward": -0.04999999701976776, + "reward_std": 0.44950494170188904, + "rewards/reward_financial_reasoning/mean": -0.04999999701976776, + "rewards/reward_financial_reasoning/std": 0.44950494170188904, + "step": 62, + "step_time": 18.303554125999653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.5, + "completions/max_terminated_length": 36.5, + "completions/mean_length": 18.125, + "completions/mean_terminated_length": 18.125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.028495102404274265, + "frac_reward_zero_std": 0.5, + "grad_norm": 292.3175048828125, + "kl": 30.459757924079895, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.4946590065956116, + "num_tokens": 560885.0, + "reward": -0.06250000279396772, + "reward_std": 0.24493902921676636, + "rewards/reward_financial_reasoning/mean": -0.06250000279396772, + "rewards/reward_financial_reasoning/std": 0.24493904411792755, + "step": 64, + "step_time": 19.929579762499998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 17.6875, + "completions/mean_terminated_length": 17.6875, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.029385574354407838, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.331460565328598, + "kl": 6.336628198623657, + "learning_rate": 1.4444444444444445e-06, + "loss": 0.2450381964445114, + "num_tokens": 578064.0, + "reward": -0.08750000037252903, + "reward_std": 0.2354431226849556, + "rewards/reward_financial_reasoning/mean": -0.08750000037252903, + "rewards/reward_financial_reasoning/std": 0.2354431226849556, + "step": 66, + "step_time": 16.05476952400386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 14.3125, + "completions/mean_terminated_length": 14.3125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.030276046304541407, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.422355890274048, + "kl": 1.9846842586994171, + "learning_rate": 1.4888888888888888e-06, + "loss": 0.10696038603782654, + "num_tokens": 590245.0, + "reward": 0.2500000074505806, + "reward_std": 0.257793553173542, + "rewards/reward_financial_reasoning/mean": 0.2500000074505806, + "rewards/reward_financial_reasoning/std": 0.2577935680747032, + "step": 68, + "step_time": 12.55215245600084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 18.0625, + "completions/mean_terminated_length": 18.0625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.031166518254674976, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.34565091133117676, + "kl": 3.1595654487609863, + "learning_rate": 1.5333333333333334e-06, + "loss": -0.003766145557165146, + "num_tokens": 605742.0, + "reward": -0.0625, + "reward_std": 0.3245647996664047, + "rewards/reward_financial_reasoning/mean": -0.0625, + "rewards/reward_financial_reasoning/std": 0.3245648145675659, + "step": 70, + "step_time": 25.756368717500663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 9.1875, + "completions/mean_terminated_length": 9.1875, + "completions/min_length": 3.5, + "completions/min_terminated_length": 3.5, + "epoch": 0.032056990204808546, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.074850082397461, + "kl": 5.072204828262329, + "learning_rate": 1.5777777777777778e-06, + "loss": 0.19083915650844574, + "num_tokens": 630049.0, + "reward": -0.38750000298023224, + "reward_std": 0.155264750123024, + "rewards/reward_financial_reasoning/mean": -0.38750000298023224, + "rewards/reward_financial_reasoning/std": 0.155264750123024, + "step": 72, + "step_time": 19.102319961501053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.0, + "completions/max_terminated_length": 41.0, + "completions/mean_length": 19.125, + "completions/mean_terminated_length": 19.125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.03294746215494212, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7271537780761719, + "kl": 6.776509702205658, + "learning_rate": 1.6222222222222223e-06, + "loss": 0.2730098068714142, + "num_tokens": 644963.0, + "reward": 0.125, + "reward_std": 0.4242233335971832, + "rewards/reward_financial_reasoning/mean": 0.125, + "rewards/reward_financial_reasoning/std": 0.4242233335971832, + "step": 74, + "step_time": 17.979279184004554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.1875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 156.0, + "completions/max_terminated_length": 37.5, + "completions/mean_length": 37.1875, + "completions/mean_terminated_length": 22.23214340209961, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.03383793410507569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6272057294845581, + "kl": 2.333713859319687, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.0896148756146431, + "num_tokens": 658926.0, + "reward": -0.12500000186264515, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": -0.12500000186264515, + "rewards/reward_financial_reasoning/std": 0.24053511768579483, + "step": 76, + "step_time": 48.02580772499823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.5, + "completions/max_terminated_length": 28.5, + "completions/mean_length": 11.375, + "completions/mean_terminated_length": 11.375, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.034728406055209264, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.010223865509033, + "kl": 5.109567791223526, + "learning_rate": 1.7111111111111112e-06, + "loss": -0.013501618057489395, + "num_tokens": 676228.0, + "reward": 0.10000000894069672, + "reward_std": 0.45257411897182465, + "rewards/reward_financial_reasoning/mean": 0.10000000894069672, + "rewards/reward_financial_reasoning/std": 0.45257411897182465, + "step": 78, + "step_time": 16.771125109999048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.5, + "completions/max_terminated_length": 25.5, + "completions/mean_length": 14.0625, + "completions/mean_terminated_length": 14.0625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.03561887800534283, + "frac_reward_zero_std": 0.5, + "grad_norm": 5.468640327453613, + "kl": 3.081940993666649, + "learning_rate": 1.7555555555555556e-06, + "loss": -0.07647820562124252, + "num_tokens": 698397.0, + "reward": -0.049999999813735485, + "reward_std": 0.2613307610154152, + "rewards/reward_financial_reasoning/mean": -0.049999999813735485, + "rewards/reward_financial_reasoning/std": 0.2613307684659958, + "step": 80, + "step_time": 19.49620248700012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.125, + "completions/mean_terminated_length": 15.125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.0365093499554764, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.437584161758423, + "kl": 3.713206648826599, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.10175147652626038, + "num_tokens": 719007.0, + "reward": -0.049999999813735485, + "reward_std": 0.2613307684659958, + "rewards/reward_financial_reasoning/mean": -0.049999999813735485, + "rewards/reward_financial_reasoning/std": 0.2613307684659958, + "step": 82, + "step_time": 17.08468607800205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.5, + "completions/max_terminated_length": 25.5, + "completions/mean_length": 21.625, + "completions/mean_terminated_length": 21.625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.037399821905609976, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4997353255748749, + "kl": 3.0508928298950195, + "learning_rate": 1.8444444444444445e-06, + "loss": 0.11351507902145386, + "num_tokens": 736329.0, + "reward": 0.06250000093132257, + "reward_std": 0.1060660183429718, + "rewards/reward_financial_reasoning/mean": 0.06250000093132257, + "rewards/reward_financial_reasoning/std": 0.1060660183429718, + "step": 84, + "step_time": 16.044017669501045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 145.5, + "completions/max_terminated_length": 21.5, + "completions/mean_length": 26.125, + "completions/mean_terminated_length": 10.562500238418579, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.03829029385574354, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.8706929683685303, + "kl": 2.500796929001808, + "learning_rate": 1.888888888888889e-06, + "loss": -0.08194264769554138, + "num_tokens": 749475.0, + "reward": 0.1250000074505806, + "reward_std": 0.37987764179706573, + "rewards/reward_financial_reasoning/mean": 0.1250000074505806, + "rewards/reward_financial_reasoning/std": 0.3798776715993881, + "step": 86, + "step_time": 43.53679231749993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.6875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 141.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 34.6875, + "completions/mean_terminated_length": 19.85714340209961, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.039180765805877114, + "frac_reward_zero_std": 0.75, + "grad_norm": 10.299760818481445, + "kl": 3.631163567304611, + "learning_rate": 1.9333333333333336e-06, + "loss": 0.09746833145618439, + "num_tokens": 773742.0, + "reward": -0.10000000521540642, + "reward_std": 0.09258200973272324, + "rewards/reward_financial_reasoning/mean": -0.10000000521540642, + "rewards/reward_financial_reasoning/std": 0.09258200973272324, + "step": 88, + "step_time": 51.070965140001135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 76.875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 76.875, + "completions/mean_terminated_length": 18.742857933044434, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.04007123775601069, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2004384994506836, + "kl": 4.518661990761757, + "learning_rate": 1.977777777777778e-06, + "loss": 0.21323581039905548, + "num_tokens": 793972.0, + "reward": -0.03749999403953552, + "reward_std": 0.285500705242157, + "rewards/reward_financial_reasoning/mean": -0.03749999403953552, + "rewards/reward_financial_reasoning/std": 0.28550073504447937, + "step": 90, + "step_time": 77.64364999449936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.5, + "completions/max_terminated_length": 13.5, + "completions/mean_length": 9.0625, + "completions/mean_terminated_length": 9.0625, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.04096170970614425, + "frac_reward_zero_std": 1.0, + "grad_norm": 15.11466121673584, + "kl": 4.976789236068726, + "learning_rate": 2.0222222222222223e-06, + "loss": 0.16917964816093445, + "num_tokens": 806405.0, + "reward": 0.6749999970197678, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": 0.6749999970197678, + "rewards/reward_financial_reasoning/std": 0.24053513258695602, + "step": 92, + "step_time": 11.1175422624965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 13.1875, + "completions/mean_terminated_length": 13.1875, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.041852181656277826, + "frac_reward_zero_std": 1.0, + "grad_norm": 11.747920989990234, + "kl": 5.53629332780838, + "learning_rate": 2.0666666666666666e-06, + "loss": 0.21535387635231018, + "num_tokens": 819672.0, + "reward": 0.2500000037252903, + "reward_std": 0.32071349024772644, + "rewards/reward_financial_reasoning/mean": 0.2500000037252903, + "rewards/reward_financial_reasoning/std": 0.32071349024772644, + "step": 94, + "step_time": 15.913111352499982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.25, + "completions/mean_terminated_length": 17.25, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.0427426536064114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10250017791986465, + "kl": 2.3222732543945312, + "learning_rate": 2.1111111111111114e-06, + "loss": 0.09148351848125458, + "num_tokens": 837228.0, + "reward": 0.45000001788139343, + "reward_std": 0.37416574358940125, + "rewards/reward_financial_reasoning/mean": 0.45000001788139343, + "rewards/reward_financial_reasoning/std": 0.37416577339172363, + "step": 96, + "step_time": 14.932274136997876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 60.8125, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 134.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 60.8125, + "completions/mean_terminated_length": 17.6875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.04363312555654497, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.2879371643066406, + "kl": 2.264972969889641, + "learning_rate": 2.1555555555555558e-06, + "loss": 0.09448125958442688, + "num_tokens": 852985.0, + "reward": -0.0625, + "reward_std": 0.3245647996664047, + "rewards/reward_financial_reasoning/mean": -0.0625, + "rewards/reward_financial_reasoning/std": 0.3245648145675659, + "step": 98, + "step_time": 42.98987148800006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.5, + "completions/max_terminated_length": 25.5, + "completions/mean_length": 21.0625, + "completions/mean_terminated_length": 21.0625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.04452359750667854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05980491638183594, + "kl": 2.593031495809555, + "learning_rate": 2.2e-06, + "loss": 0.10243881493806839, + "num_tokens": 866602.0, + "reward": -0.15000000223517418, + "reward_std": 0.05345224589109421, + "rewards/reward_financial_reasoning/mean": -0.15000000223517418, + "rewards/reward_financial_reasoning/std": 0.05345224589109421, + "step": 100, + "step_time": 13.76024783450157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.5, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 14.625, + "completions/mean_terminated_length": 14.625, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.04541406945681211, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.663613319396973, + "kl": 3.8922190964221954, + "learning_rate": 2.2444444444444445e-06, + "loss": 0.06969564408063889, + "num_tokens": 881948.0, + "reward": 0.3124999888241291, + "reward_std": 0.30207616090774536, + "rewards/reward_financial_reasoning/mean": 0.3124999888241291, + "rewards/reward_financial_reasoning/std": 0.302076131105423, + "step": 102, + "step_time": 13.194114739499128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.5, + "completions/max_terminated_length": 38.5, + "completions/mean_length": 22.1875, + "completions/mean_terminated_length": 22.1875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.04630454140694568, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29489243030548096, + "kl": 2.4103888869285583, + "learning_rate": 2.2888888888888892e-06, + "loss": 0.09568312764167786, + "num_tokens": 901039.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 104, + "step_time": 20.464529470000343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.1875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 176.5, + "completions/max_terminated_length": 60.5, + "completions/mean_length": 34.1875, + "completions/mean_terminated_length": 19.3125, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.04719501335707925, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2691603899002075, + "kl": 2.7120189517736435, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.07602652907371521, + "num_tokens": 922626.0, + "reward": -0.05000000074505806, + "reward_std": 0.21905138343572617, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.21905138343572617, + "step": 106, + "step_time": 58.071344395997585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.0, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 18.25, + "completions/mean_terminated_length": 18.25, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.04808548530721282, + "frac_reward_zero_std": 0.5, + "grad_norm": 6.271705627441406, + "kl": 3.232674241065979, + "learning_rate": 2.377777777777778e-06, + "loss": 0.07806958258152008, + "num_tokens": 934790.0, + "reward": 0.050000011920928955, + "reward_std": 0.42308470606803894, + "rewards/reward_financial_reasoning/mean": 0.050000011920928955, + "rewards/reward_financial_reasoning/std": 0.4230847507715225, + "step": 108, + "step_time": 16.047973144000935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 24.3125, + "completions/mean_terminated_length": 24.3125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.048975957257346395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09767260402441025, + "kl": 2.3331067264080048, + "learning_rate": 2.4222222222222223e-06, + "loss": 0.09566066414117813, + "num_tokens": 950203.0, + "reward": -3.725290298461914e-09, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 110, + "step_time": 24.890910685500785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.6875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 135.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 35.6875, + "completions/mean_terminated_length": 21.598215103149414, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.04986642920747997, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7795641422271729, + "kl": 7.069983154535294, + "learning_rate": 2.466666666666667e-06, + "loss": 0.3206178545951843, + "num_tokens": 972878.0, + "reward": -0.01249999925494194, + "reward_std": 0.12464234232902527, + "rewards/reward_financial_reasoning/mean": -0.01249999925494194, + "rewards/reward_financial_reasoning/std": 0.12464234232902527, + "step": 112, + "step_time": 47.53451731649693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 24.875, + "completions/mean_terminated_length": 24.875, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.05075690115761353, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.755477786064148, + "kl": 2.44833767414093, + "learning_rate": 2.5111111111111114e-06, + "loss": -0.027675483375787735, + "num_tokens": 991140.0, + "reward": 0.08750000596046448, + "reward_std": 0.3419739603996277, + "rewards/reward_financial_reasoning/mean": 0.08750000596046448, + "rewards/reward_financial_reasoning/std": 0.34197400510311127, + "step": 114, + "step_time": 18.244106624995766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 14.9375, + "completions/mean_terminated_length": 14.9375, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.051647373107747106, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7609986066818237, + "kl": 3.860931694507599, + "learning_rate": 2.5555555555555557e-06, + "loss": 0.11580531299114227, + "num_tokens": 1007059.0, + "reward": 0.04999999701976776, + "reward_std": 0.3033005967736244, + "rewards/reward_financial_reasoning/mean": 0.04999999701976776, + "rewards/reward_financial_reasoning/std": 0.3033006191253662, + "step": 116, + "step_time": 14.473379719500372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.5, + "completions/max_terminated_length": 35.5, + "completions/mean_length": 21.375, + "completions/mean_terminated_length": 21.375, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.05253784505788068, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042947202920913696, + "kl": 5.62386429309845, + "learning_rate": 2.6e-06, + "loss": 0.17796431481838226, + "num_tokens": 1027377.0, + "reward": -0.07500000484287739, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": -0.07500000484287739, + "rewards/reward_financial_reasoning/std": 0.18708287924528122, + "step": 118, + "step_time": 20.568094111997198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.5, + "completions/max_terminated_length": 69.5, + "completions/mean_length": 31.625, + "completions/mean_terminated_length": 31.625, + "completions/min_length": 2.5, + "completions/min_terminated_length": 2.5, + "epoch": 0.053428317008014245, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8792534470558167, + "kl": 4.0221880078315735, + "learning_rate": 2.6444444444444444e-06, + "loss": 0.06174422800540924, + "num_tokens": 1047523.0, + "reward": -0.1374999973922968, + "reward_std": 0.3001621291041374, + "rewards/reward_financial_reasoning/mean": -0.1374999973922968, + "rewards/reward_financial_reasoning/std": 0.3001621440052986, + "step": 120, + "step_time": 29.076913421999052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.5, + "completions/max_terminated_length": 25.5, + "completions/mean_length": 21.6875, + "completions/mean_terminated_length": 21.6875, + "completions/min_length": 17.5, + "completions/min_terminated_length": 17.5, + "epoch": 0.05431878895814782, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.11915815621614456, + "kl": 2.785850167274475, + "learning_rate": 2.6888888888888892e-06, + "loss": 0.12607991695404053, + "num_tokens": 1065086.0, + "reward": -0.025000005960464478, + "reward_std": 0.1776151806116104, + "rewards/reward_financial_reasoning/mean": -0.025000005960464478, + "rewards/reward_financial_reasoning/std": 0.177615188062191, + "step": 122, + "step_time": 15.821294531000603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.6875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 134.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 24.6875, + "completions/mean_terminated_length": 9.151785850524902, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.05520926090828139, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.4671032428741455, + "kl": 3.9387396574020386, + "learning_rate": 2.7333333333333336e-06, + "loss": 0.13445711135864258, + "num_tokens": 1082913.0, + "reward": 0.050000011920928955, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.050000011920928955, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 124, + "step_time": 44.41438667900002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 17.5625, + "completions/mean_terminated_length": 17.5625, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.05609973285841496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21380269527435303, + "kl": 2.8844070732593536, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.10589596629142761, + "num_tokens": 1103354.0, + "reward": 0.1250000037252903, + "reward_std": 0.34743958711624146, + "rewards/reward_financial_reasoning/mean": 0.1250000037252903, + "rewards/reward_financial_reasoning/std": 0.34743958711624146, + "step": 126, + "step_time": 16.954522954500135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 137.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 9.357142925262451, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.05699020480854853, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.337584972381592, + "kl": 5.580496460199356, + "learning_rate": 2.8222222222222223e-06, + "loss": 0.20137156546115875, + "num_tokens": 1120778.0, + "reward": -0.04999999701976776, + "reward_std": 0.37416573613882065, + "rewards/reward_financial_reasoning/mean": -0.04999999701976776, + "rewards/reward_financial_reasoning/std": 0.37416573613882065, + "step": 128, + "step_time": 45.34434924549896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.5, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.0578806767586821, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.981386661529541, + "kl": 3.401599198579788, + "learning_rate": 2.866666666666667e-06, + "loss": 0.12407220900058746, + "num_tokens": 1141394.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 130, + "step_time": 16.4087845009999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.5, + "completions/max_terminated_length": 16.5, + "completions/mean_length": 10.9375, + "completions/mean_terminated_length": 10.9375, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.058771148708815675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20108631253242493, + "kl": 3.4849415719509125, + "learning_rate": 2.9111111111111114e-06, + "loss": 0.12001919746398926, + "num_tokens": 1154313.0, + "reward": 0.3499999865889549, + "reward_std": 0.21380899101495743, + "rewards/reward_financial_reasoning/mean": 0.3499999865889549, + "rewards/reward_financial_reasoning/std": 0.21380899101495743, + "step": 132, + "step_time": 12.112006235996887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 12.0625, + "completions/mean_terminated_length": 12.0625, + "completions/min_length": 3.5, + "completions/min_terminated_length": 3.5, + "epoch": 0.05966162065894924, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.684659957885742, + "kl": 3.3992087990045547, + "learning_rate": 2.955555555555556e-06, + "loss": 0.1022736206650734, + "num_tokens": 1171394.0, + "reward": 0.1374999936670065, + "reward_std": 0.4541053995490074, + "rewards/reward_financial_reasoning/mean": 0.1374999936670065, + "rewards/reward_financial_reasoning/std": 0.4541054293513298, + "step": 134, + "step_time": 14.642255357997783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.5, + "completions/max_terminated_length": 13.5, + "completions/mean_length": 11.375, + "completions/mean_terminated_length": 11.375, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.060552092609082814, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3956330716609955, + "kl": 4.586791276931763, + "learning_rate": 3e-06, + "loss": 0.12092146277427673, + "num_tokens": 1186376.0, + "reward": 0.21250000968575478, + "reward_std": 0.20310094952583313, + "rewards/reward_financial_reasoning/mean": 0.21250000968575478, + "rewards/reward_financial_reasoning/std": 0.20310097932815552, + "step": 136, + "step_time": 11.559512399000596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.75, + "completions/mean_terminated_length": 10.75, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.06144256455921639, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.993830680847168, + "kl": 3.829523801803589, + "learning_rate": 3.044444444444445e-06, + "loss": 0.13624754548072815, + "num_tokens": 1201172.0, + "reward": -0.0624999962747097, + "reward_std": 0.2199837565422058, + "rewards/reward_financial_reasoning/mean": -0.0624999962747097, + "rewards/reward_financial_reasoning/std": 0.219983771443367, + "step": 138, + "step_time": 12.256231981002202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 149.0, + "completions/max_terminated_length": 34.5, + "completions/mean_length": 35.375, + "completions/mean_terminated_length": 20.723215103149414, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.06233303650934995, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.678333282470703, + "kl": 3.021816372871399, + "learning_rate": 3.088888888888889e-06, + "loss": 0.032898858189582825, + "num_tokens": 1223026.0, + "reward": -0.08750000037252903, + "reward_std": 0.20482071489095688, + "rewards/reward_financial_reasoning/mean": -0.08750000037252903, + "rewards/reward_financial_reasoning/std": 0.20482071489095688, + "step": 140, + "step_time": 50.93402997600242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 11.5, + "completions/mean_terminated_length": 11.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.06322350845948353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1813589483499527, + "kl": 4.437779903411865, + "learning_rate": 3.133333333333334e-06, + "loss": 0.1627044826745987, + "num_tokens": 1245666.0, + "reward": -0.30000000447034836, + "reward_std": 0.21380899101495743, + "rewards/reward_financial_reasoning/mean": -0.30000000447034836, + "rewards/reward_financial_reasoning/std": 0.21380899101495743, + "step": 142, + "step_time": 18.050760761499987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.75, + "completions/mean_terminated_length": 14.75, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.06411398040961709, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23129655420780182, + "kl": 2.1533931493759155, + "learning_rate": 3.177777777777778e-06, + "loss": 0.08247792720794678, + "num_tokens": 1258398.0, + "reward": 0.45000001788139343, + "reward_std": 0.37416574358940125, + "rewards/reward_financial_reasoning/mean": 0.45000001788139343, + "rewards/reward_financial_reasoning/std": 0.37416577339172363, + "step": 144, + "step_time": 11.140301175000786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 15.125, + "completions/mean_terminated_length": 15.125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.06500445235975066, + "frac_reward_zero_std": 0.5, + "grad_norm": 4.715083599090576, + "kl": 2.318701334297657, + "learning_rate": 3.2222222222222227e-06, + "loss": -0.08997282385826111, + "num_tokens": 1279640.0, + "reward": 0.07500000298023224, + "reward_std": 0.3218744471669197, + "rewards/reward_financial_reasoning/mean": 0.07500000298023224, + "rewards/reward_financial_reasoning/std": 0.3218744695186615, + "step": 146, + "step_time": 17.162366078498962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.06589492430988424, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4836774468421936, + "kl": 3.3755117654800415, + "learning_rate": 3.266666666666667e-06, + "loss": 0.18510375916957855, + "num_tokens": 1296112.0, + "reward": 0.07500000111758709, + "reward_std": 0.276574470102787, + "rewards/reward_financial_reasoning/mean": 0.07500000111758709, + "rewards/reward_financial_reasoning/std": 0.2765744850039482, + "step": 148, + "step_time": 13.026451161000296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 18.5625, + "completions/mean_terminated_length": 18.5625, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.06678539626001781, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11278322339057922, + "kl": 2.8200909793376923, + "learning_rate": 3.3111111111111118e-06, + "loss": 0.11123226583003998, + "num_tokens": 1311249.0, + "reward": -0.15000000223517418, + "reward_std": 0.05345224589109421, + "rewards/reward_financial_reasoning/mean": -0.15000000223517418, + "rewards/reward_financial_reasoning/std": 0.05345224589109421, + "step": 150, + "step_time": 14.589614674499899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 132.5, + "completions/max_terminated_length": 10.5, + "completions/mean_length": 24.0, + "completions/mean_terminated_length": 8.633928775787354, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.06767586821015138, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.057661056518555, + "kl": 25.710087820887566, + "learning_rate": 3.3555555555555557e-06, + "loss": 1.1084628105163574, + "num_tokens": 1330345.0, + "reward": -0.012499988079071045, + "reward_std": 0.28327932208776474, + "rewards/reward_financial_reasoning/mean": -0.012499988079071045, + "rewards/reward_financial_reasoning/std": 0.2832793518900871, + "step": 152, + "step_time": 44.78167107950139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.5, + "completions/max_terminated_length": 39.5, + "completions/mean_length": 28.375, + "completions/mean_terminated_length": 28.375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.06856634016028496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4117250144481659, + "kl": 2.3949645459651947, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.09245544672012329, + "num_tokens": 1352247.0, + "reward": -3.725290298461914e-09, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 154, + "step_time": 22.419895262502905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.5, + "completions/max_terminated_length": 34.5, + "completions/mean_length": 19.625, + "completions/mean_terminated_length": 19.625, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.06945681211041853, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.388998031616211, + "kl": 4.125469475984573, + "learning_rate": 3.444444444444445e-06, + "loss": 0.14830118417739868, + "num_tokens": 1368657.0, + "reward": 0.3999999836087227, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": 0.3999999836087227, + "rewards/reward_financial_reasoning/std": 0.16035675257444382, + "step": 156, + "step_time": 17.970027042503716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 21.625, + "completions/mean_terminated_length": 21.625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.07034728406055209, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.199248790740967, + "kl": 2.5577471256256104, + "learning_rate": 3.4888888888888896e-06, + "loss": 0.031034421175718307, + "num_tokens": 1392611.0, + "reward": 0.06250000093132257, + "reward_std": 0.1060660183429718, + "rewards/reward_financial_reasoning/mean": 0.06250000093132257, + "rewards/reward_financial_reasoning/std": 0.1060660183429718, + "step": 158, + "step_time": 19.871817894503693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.0625, + "completions/mean_terminated_length": 21.0625, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.07123775601068566, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.693187713623047, + "kl": 2.236900717020035, + "learning_rate": 3.5333333333333335e-06, + "loss": 0.10101586580276489, + "num_tokens": 1416028.0, + "reward": 0.02500000037252903, + "reward_std": 0.13887301087379456, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.13887301087379456, + "step": 160, + "step_time": 20.560153357999297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.5, + "completions/max_terminated_length": 30.5, + "completions/mean_length": 22.9375, + "completions/mean_terminated_length": 22.9375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.07212822796081923, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.08777616918087006, + "kl": 2.303772583603859, + "learning_rate": 3.577777777777778e-06, + "loss": 0.09048432111740112, + "num_tokens": 1435923.0, + "reward": -0.11250000633299351, + "reward_std": 0.1989518702030182, + "rewards/reward_financial_reasoning/mean": -0.11250000633299351, + "rewards/reward_financial_reasoning/std": 0.1989518627524376, + "step": 162, + "step_time": 18.901748009000585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.0730186999109528, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.7375295162200928, + "kl": 3.2199981957674026, + "learning_rate": 3.6222222222222226e-06, + "loss": 0.10302453488111496, + "num_tokens": 1454867.0, + "reward": 0.26250001043081284, + "reward_std": 0.3512909263372421, + "rewards/reward_financial_reasoning/mean": 0.26250001043081284, + "rewards/reward_financial_reasoning/std": 0.3512909561395645, + "step": 164, + "step_time": 16.01710794649989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.5, + "completions/max_terminated_length": 29.5, + "completions/mean_length": 13.6875, + "completions/mean_terminated_length": 13.6875, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.07390917186108638, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24207016825675964, + "kl": 4.016330122947693, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.15451282262802124, + "num_tokens": 1474742.0, + "reward": -0.04999999701976776, + "reward_std": 0.37416573613882065, + "rewards/reward_financial_reasoning/mean": -0.04999999701976776, + "rewards/reward_financial_reasoning/std": 0.37416573613882065, + "step": 166, + "step_time": 18.76729526700001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.125, + "completions/mean_terminated_length": 11.125, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.07479964381121995, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.42672666907310486, + "kl": 4.4028559923172, + "learning_rate": 3.7111111111111113e-06, + "loss": 0.17786993086338043, + "num_tokens": 1492792.0, + "reward": 0.02499999850988388, + "reward_std": 0.45434408634901047, + "rewards/reward_financial_reasoning/mean": 0.02499999850988388, + "rewards/reward_financial_reasoning/std": 0.45434409379959106, + "step": 168, + "step_time": 14.194298194499424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.5, + "completions/max_terminated_length": 19.5, + "completions/mean_length": 14.6875, + "completions/mean_terminated_length": 14.6875, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.07569011576135352, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.46550452709198, + "kl": 3.6853742003440857, + "learning_rate": 3.7555555555555557e-06, + "loss": 0.14712777733802795, + "num_tokens": 1510931.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 170, + "step_time": 15.002850790498997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.5, + "completions/max_terminated_length": 19.5, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 16.5, + "completions/min_terminated_length": 16.5, + "epoch": 0.07658058771148708, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6111212968826294, + "kl": 3.136908233165741, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.1122005432844162, + "num_tokens": 1533875.0, + "reward": 0.037499998696148396, + "reward_std": 0.1767766922712326, + "rewards/reward_financial_reasoning/mean": 0.037499998696148396, + "rewards/reward_financial_reasoning/std": 0.1767766997218132, + "step": 172, + "step_time": 18.050798523498088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.875, + "completions/mean_terminated_length": 14.875, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.07747105966162066, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.12939698994159698, + "kl": 2.4790256321430206, + "learning_rate": 3.844444444444445e-06, + "loss": 0.020719770342111588, + "num_tokens": 1548265.0, + "reward": 0.21250000968575478, + "reward_std": 0.20310094952583313, + "rewards/reward_financial_reasoning/mean": 0.21250000968575478, + "rewards/reward_financial_reasoning/std": 0.20310097932815552, + "step": 174, + "step_time": 12.47596741849884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.5, + "completions/max_terminated_length": 23.5, + "completions/mean_length": 13.9375, + "completions/mean_terminated_length": 13.9375, + "completions/min_length": 4.5, + "completions/min_terminated_length": 4.5, + "epoch": 0.07836153161175423, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.401264190673828, + "kl": 5.6927982568740845, + "learning_rate": 3.88888888888889e-06, + "loss": 0.09270425140857697, + "num_tokens": 1564544.0, + "reward": 0.125, + "reward_std": 0.3060004860162735, + "rewards/reward_financial_reasoning/mean": 0.125, + "rewards/reward_financial_reasoning/std": 0.3060004934668541, + "step": 176, + "step_time": 15.803215666997858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.5, + "completions/max_terminated_length": 26.5, + "completions/mean_length": 18.6875, + "completions/mean_terminated_length": 18.6875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.0792520035618878, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2923208177089691, + "kl": 3.5329647958278656, + "learning_rate": 3.9333333333333335e-06, + "loss": 0.14282937347888947, + "num_tokens": 1585955.0, + "reward": -0.07499999925494194, + "reward_std": 0.026726126670837402, + "rewards/reward_financial_reasoning/mean": -0.07499999925494194, + "rewards/reward_financial_reasoning/std": 0.026726126670837402, + "step": 178, + "step_time": 18.7747930500027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 19.9375, + "completions/mean_terminated_length": 19.9375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.08014247551202137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2639215886592865, + "kl": 6.156628400087357, + "learning_rate": 3.977777777777778e-06, + "loss": 0.19962944090366364, + "num_tokens": 1610266.0, + "reward": -0.32500000298023224, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": -0.32500000298023224, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 180, + "step_time": 21.797623717498936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 133.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 24.625, + "completions/mean_terminated_length": 9.339285850524902, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.08103294746215495, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0663926601409912, + "kl": 3.6094931960105896, + "learning_rate": 4.022222222222222e-06, + "loss": 0.20474201440811157, + "num_tokens": 1627628.0, + "reward": 0.08750000596046448, + "reward_std": 0.6723021864891052, + "rewards/reward_financial_reasoning/mean": 0.08750000596046448, + "rewards/reward_financial_reasoning/std": 0.6723021864891052, + "step": 182, + "step_time": 44.85433911799737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.5, + "completions/max_terminated_length": 19.5, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.0819234194122885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2525313198566437, + "kl": 3.37558776140213, + "learning_rate": 4.066666666666667e-06, + "loss": 0.13418884575366974, + "num_tokens": 1645004.0, + "reward": 0.025000005960464478, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": 0.025000005960464478, + "rewards/reward_financial_reasoning/std": 0.24053513258695602, + "step": 184, + "step_time": 14.621046053998725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.08281389136242208, + "frac_reward_zero_std": 0.5, + "grad_norm": 4.167638301849365, + "kl": 3.0642621517181396, + "learning_rate": 4.111111111111111e-06, + "loss": 0.0730942115187645, + "num_tokens": 1665756.0, + "reward": -0.15000000409781933, + "reward_std": 0.27403824776411057, + "rewards/reward_financial_reasoning/mean": -0.15000000409781933, + "rewards/reward_financial_reasoning/std": 0.27403824031352997, + "step": 186, + "step_time": 18.20900296650325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.0, + "completions/mean_terminated_length": 18.0, + "completions/min_length": 14.5, + "completions/min_terminated_length": 14.5, + "epoch": 0.08370436331255565, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09201586991548538, + "kl": 2.442968338727951, + "learning_rate": 4.155555555555556e-06, + "loss": 0.09587804228067398, + "num_tokens": 1684332.0, + "reward": -0.07500000484287739, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": -0.07500000484287739, + "rewards/reward_financial_reasoning/std": 0.18708287924528122, + "step": 188, + "step_time": 15.8890612949981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 45.0625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 148.0, + "completions/max_terminated_length": 54.5, + "completions/mean_length": 45.0625, + "completions/mean_terminated_length": 30.642857551574707, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.08459483526268922, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.757686972618103, + "kl": 2.118753992021084, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.025539696216583252, + "num_tokens": 1703053.0, + "reward": -0.012500002980232239, + "reward_std": 0.2711125537753105, + "rewards/reward_financial_reasoning/mean": -0.012500002980232239, + "rewards/reward_financial_reasoning/std": 0.2711125761270523, + "step": 190, + "step_time": 48.18112432499947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 16.375, + "completions/mean_terminated_length": 16.375, + "completions/min_length": 3.5, + "completions/min_terminated_length": 3.5, + "epoch": 0.0854853072128228, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.921569585800171, + "kl": 3.9216238781809807, + "learning_rate": 4.244444444444445e-06, + "loss": 0.12688420712947845, + "num_tokens": 1716339.0, + "reward": 0.1750000026077032, + "reward_std": 0.40089186280965805, + "rewards/reward_financial_reasoning/mean": 0.1750000026077032, + "rewards/reward_financial_reasoning/std": 0.40089186280965805, + "step": 192, + "step_time": 22.157812262499647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.5, + "completions/max_terminated_length": 47.5, + "completions/mean_length": 35.375, + "completions/mean_terminated_length": 35.375, + "completions/min_length": 24.5, + "completions/min_terminated_length": 24.5, + "epoch": 0.08637577916295637, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.06356346607208252, + "kl": 2.7308596670627594, + "learning_rate": 4.288888888888889e-06, + "loss": 0.12605991959571838, + "num_tokens": 1736721.0, + "reward": -0.17500000447034836, + "reward_std": 0.1776151806116104, + "rewards/reward_financial_reasoning/mean": -0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.177615188062191, + "step": 194, + "step_time": 23.11299426900041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 19.875, + "completions/mean_terminated_length": 19.875, + "completions/min_length": 14.5, + "completions/min_terminated_length": 14.5, + "epoch": 0.08726625111308994, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.439833164215088, + "kl": 2.893902435898781, + "learning_rate": 4.333333333333334e-06, + "loss": 0.14680534601211548, + "num_tokens": 1752303.0, + "reward": 0.04999999701976776, + "reward_std": 0.2905927076935768, + "rewards/reward_financial_reasoning/mean": 0.04999999701976776, + "rewards/reward_financial_reasoning/std": 0.290592722594738, + "step": 196, + "step_time": 16.636957890499616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.5, + "completions/max_terminated_length": 48.5, + "completions/mean_length": 23.0625, + "completions/mean_terminated_length": 23.0625, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.0881567230632235, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.632042407989502, + "kl": 2.5856750905513763, + "learning_rate": 4.377777777777778e-06, + "loss": 0.02410995587706566, + "num_tokens": 1768720.0, + "reward": 0.11250000447034836, + "reward_std": 0.31000544875860214, + "rewards/reward_financial_reasoning/mean": 0.11250000447034836, + "rewards/reward_financial_reasoning/std": 0.3100054860115051, + "step": 198, + "step_time": 21.464168812499338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 13.9375, + "completions/mean_terminated_length": 13.9375, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.08904719501335707, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.4800167083740234, + "kl": 3.6862347424030304, + "learning_rate": 4.422222222222223e-06, + "loss": 0.1973346471786499, + "num_tokens": 1789447.0, + "reward": -0.08749999850988388, + "reward_std": 0.485219344496727, + "rewards/reward_financial_reasoning/mean": -0.08749999850988388, + "rewards/reward_financial_reasoning/std": 0.485219344496727, + "step": 200, + "step_time": 22.986546036001528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.5, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 14.4375, + "completions/mean_terminated_length": 14.4375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.08993766696349065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7083855271339417, + "kl": 2.8951866924762726, + "learning_rate": 4.4666666666666665e-06, + "loss": 0.12021103501319885, + "num_tokens": 1807526.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 202, + "step_time": 14.811557051994896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 14.9375, + "completions/mean_terminated_length": 14.9375, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.09082813891362422, + "frac_reward_zero_std": 0.5, + "grad_norm": 4.18487024307251, + "kl": 4.287087470293045, + "learning_rate": 4.511111111111111e-06, + "loss": 0.10975737869739532, + "num_tokens": 1825821.0, + "reward": -0.012499988079071045, + "reward_std": 0.3419739603996277, + "rewards/reward_financial_reasoning/mean": -0.012499988079071045, + "rewards/reward_financial_reasoning/std": 0.3419739902019501, + "step": 204, + "step_time": 16.611893776500438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.4375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 143.0, + "completions/max_terminated_length": 29.5, + "completions/mean_length": 36.4375, + "completions/mean_terminated_length": 21.785715103149414, + "completions/min_length": 16.5, + "completions/min_terminated_length": 16.5, + "epoch": 0.0917186108637578, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.862529277801514, + "kl": 2.1467432379722595, + "learning_rate": 4.555555555555556e-06, + "loss": 0.1539805829524994, + "num_tokens": 1846972.0, + "reward": 0.11250000447034836, + "reward_std": 0.31000544875860214, + "rewards/reward_financial_reasoning/mean": 0.11250000447034836, + "rewards/reward_financial_reasoning/std": 0.3100054860115051, + "step": 206, + "step_time": 50.025916307500665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.5, + "completions/max_terminated_length": 32.5, + "completions/mean_length": 16.5625, + "completions/mean_terminated_length": 16.5625, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.09260908281389137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3114466369152069, + "kl": 2.7216466069221497, + "learning_rate": 4.600000000000001e-06, + "loss": 0.10927754640579224, + "num_tokens": 1859013.0, + "reward": 0.07500000298023224, + "reward_std": 0.34743961691856384, + "rewards/reward_financial_reasoning/mean": 0.07500000298023224, + "rewards/reward_financial_reasoning/std": 0.34743961691856384, + "step": 208, + "step_time": 14.936293552002098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.5, + "completions/max_terminated_length": 29.5, + "completions/mean_length": 19.8125, + "completions/mean_terminated_length": 19.8125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.09349955476402494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16725488007068634, + "kl": 2.662395626306534, + "learning_rate": 4.644444444444445e-06, + "loss": 0.11168936640024185, + "num_tokens": 1880250.0, + "reward": -0.15000000223517418, + "reward_std": 0.05345224589109421, + "rewards/reward_financial_reasoning/mean": -0.15000000223517418, + "rewards/reward_financial_reasoning/std": 0.05345224589109421, + "step": 210, + "step_time": 19.786764974996913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.5, + "completions/max_terminated_length": 21.5, + "completions/mean_length": 18.5, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 15.5, + "completions/min_terminated_length": 15.5, + "epoch": 0.0943900267141585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07584002614021301, + "kl": 2.111318424344063, + "learning_rate": 4.6888888888888895e-06, + "loss": 0.08701398968696594, + "num_tokens": 1897586.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 212, + "step_time": 15.371527886505646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.5, + "completions/max_terminated_length": 47.5, + "completions/mean_length": 22.125, + "completions/mean_terminated_length": 22.125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.09528049866429207, + "frac_reward_zero_std": 0.75, + "grad_norm": 13.01284408569336, + "kl": 4.123442500829697, + "learning_rate": 4.7333333333333335e-06, + "loss": 0.21884416043758392, + "num_tokens": 1918428.0, + "reward": -0.03750000149011612, + "reward_std": 0.4317670986056328, + "rewards/reward_financial_reasoning/mean": -0.03750000149011612, + "rewards/reward_financial_reasoning/std": 0.4317671060562134, + "step": 214, + "step_time": 24.531285858996853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 12.375, + "completions/mean_terminated_length": 12.375, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.09617097061442564, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17621411383152008, + "kl": 2.0488149896264076, + "learning_rate": 4.777777777777778e-06, + "loss": 0.08796633780002594, + "num_tokens": 1929210.0, + "reward": 0.42500001192092896, + "reward_std": 0.5077963620424271, + "rewards/reward_financial_reasoning/mean": 0.42500001192092896, + "rewards/reward_financial_reasoning/std": 0.5077963769435883, + "step": 216, + "step_time": 12.308896507001919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.5, + "completions/max_terminated_length": 15.5, + "completions/mean_length": 11.125, + "completions/mean_terminated_length": 11.125, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.09706144256455922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2214796096086502, + "kl": 3.1335892230272293, + "learning_rate": 4.822222222222222e-06, + "loss": 0.11833228915929794, + "num_tokens": 1949284.0, + "reward": 0.1250000074505806, + "reward_std": 0.34743961691856384, + "rewards/reward_financial_reasoning/mean": 0.1250000074505806, + "rewards/reward_financial_reasoning/std": 0.34743963181972504, + "step": 218, + "step_time": 15.425380380998831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 18.4375, + "completions/mean_terminated_length": 18.4375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.09795191451469279, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.749892234802246, + "kl": 3.3727396726608276, + "learning_rate": 4.866666666666667e-06, + "loss": 0.13246247172355652, + "num_tokens": 1964643.0, + "reward": 0.050000001676380634, + "reward_std": 0.2702740728855133, + "rewards/reward_financial_reasoning/mean": 0.050000001676380634, + "rewards/reward_financial_reasoning/std": 0.2702740877866745, + "step": 220, + "step_time": 15.254870478507655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.5, + "completions/max_terminated_length": 87.5, + "completions/mean_length": 20.4375, + "completions/mean_terminated_length": 20.4375, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.09884238646482636, + "frac_reward_zero_std": 1.0, + "grad_norm": 86.07991790771484, + "kl": 5.883344158530235, + "learning_rate": 4.911111111111112e-06, + "loss": 0.24957503378391266, + "num_tokens": 1976706.0, + "reward": 0.25000000558793545, + "reward_std": 0.32071348279714584, + "rewards/reward_financial_reasoning/mean": 0.25000000558793545, + "rewards/reward_financial_reasoning/std": 0.32071349769830704, + "step": 222, + "step_time": 29.745959900501475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.5, + "completions/max_terminated_length": 25.5, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.09973285841495994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13994386792182922, + "kl": 3.634816884994507, + "learning_rate": 4.9555555555555565e-06, + "loss": 0.14822566509246826, + "num_tokens": 1990182.0, + "reward": 0.050000011920928955, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.050000011920928955, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 224, + "step_time": 14.035459309507132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 132.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 27.625, + "completions/mean_terminated_length": 12.758929252624512, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.1006233303650935, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.492895126342773, + "kl": 3.6512217223644257, + "learning_rate": 5e-06, + "loss": 0.10606303811073303, + "num_tokens": 2012120.0, + "reward": -0.14999999850988388, + "reward_std": 0.4457136541604996, + "rewards/reward_financial_reasoning/mean": -0.14999999850988388, + "rewards/reward_financial_reasoning/std": 0.4457136541604996, + "step": 226, + "step_time": 48.76606522999282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 9.625, + "completions/mean_terminated_length": 9.625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.10151380231522707, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.535083293914795, + "kl": 2.9311847388744354, + "learning_rate": 4.995051954477982e-06, + "loss": 0.1752665936946869, + "num_tokens": 2024234.0, + "reward": 0.17500000447034836, + "reward_std": 0.3918117731809616, + "rewards/reward_financial_reasoning/mean": 0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.3918117731809616, + "step": 228, + "step_time": 11.424557798505703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 144.0, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 31.875, + "completions/mean_terminated_length": 16.25000023841858, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.10240427426536064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1962670236825943, + "kl": 2.9220656007528305, + "learning_rate": 4.990103908955963e-06, + "loss": 0.10569591820240021, + "num_tokens": 2039264.0, + "reward": 0.3999999836087227, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": 0.3999999836087227, + "rewards/reward_financial_reasoning/std": 0.16035675257444382, + "step": 230, + "step_time": 45.636118210997665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 13.9375, + "completions/mean_terminated_length": 13.9375, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.10329474621549421, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.06443369388580322, + "kl": 4.253673791885376, + "learning_rate": 4.985155863433944e-06, + "loss": 0.19769635796546936, + "num_tokens": 2050935.0, + "reward": 0.41249997168779373, + "reward_std": 0.1586594134569168, + "rewards/reward_financial_reasoning/mean": 0.41249997168779373, + "rewards/reward_financial_reasoning/std": 0.1586594209074974, + "step": 232, + "step_time": 11.086136656002054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 18.5625, + "completions/mean_terminated_length": 18.5625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.10418521816562779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1366126388311386, + "kl": 2.861827075481415, + "learning_rate": 4.980207817911925e-06, + "loss": 0.1160162016749382, + "num_tokens": 2071048.0, + "reward": -3.725290298461914e-09, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 234, + "step_time": 17.117513173492625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.5, + "completions/max_terminated_length": 25.5, + "completions/mean_length": 12.9375, + "completions/mean_terminated_length": 12.9375, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.10507569011576136, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.253388404846191, + "kl": 4.99092635512352, + "learning_rate": 4.975259772389907e-06, + "loss": 0.1833035945892334, + "num_tokens": 2086695.0, + "reward": -0.375, + "reward_std": 0.13363061845302582, + "rewards/reward_financial_reasoning/mean": -0.375, + "rewards/reward_financial_reasoning/std": 0.13363061845302582, + "step": 236, + "step_time": 15.18828914400001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.10596616206589493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08144163340330124, + "kl": 2.5794699490070343, + "learning_rate": 4.970311726867888e-06, + "loss": 0.10143584758043289, + "num_tokens": 2102087.0, + "reward": -0.10000000894069672, + "reward_std": 0.21380899846553802, + "rewards/reward_financial_reasoning/mean": -0.10000000894069672, + "rewards/reward_financial_reasoning/std": 0.21380901336669922, + "step": 238, + "step_time": 19.704446994994214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.0, + "completions/max_terminated_length": 53.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.10685663401602849, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.362489700317383, + "kl": 5.458074390888214, + "learning_rate": 4.965363681345869e-06, + "loss": 0.24738503992557526, + "num_tokens": 2119563.0, + "reward": 0.04999999701976776, + "reward_std": 0.3974972069263458, + "rewards/reward_financial_reasoning/mean": 0.04999999701976776, + "rewards/reward_financial_reasoning/std": 0.3974972069263458, + "step": 240, + "step_time": 24.470748322499276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 61.3125, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 138.0, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 61.3125, + "completions/mean_terminated_length": 17.100000381469727, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.10774710596616206, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.715212821960449, + "kl": 3.6618697345256805, + "learning_rate": 4.96041563582385e-06, + "loss": 0.25621670484542847, + "num_tokens": 2137576.0, + "reward": 0.26250001043081284, + "reward_std": 0.3512909263372421, + "rewards/reward_financial_reasoning/mean": 0.26250001043081284, + "rewards/reward_financial_reasoning/std": 0.3512909561395645, + "step": 242, + "step_time": 48.974390029008646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 30.25, + "completions/mean_terminated_length": 30.25, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.10863757791629564, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29011860489845276, + "kl": 2.333177834749222, + "learning_rate": 4.9554675903018315e-06, + "loss": 0.09401866793632507, + "num_tokens": 2157516.0, + "reward": -3.725290298461914e-09, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 244, + "step_time": 24.383899069496692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.5, + "completions/max_terminated_length": 27.5, + "completions/mean_length": 10.25, + "completions/mean_terminated_length": 10.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.10952804986642921, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.16274474561214447, + "kl": 4.739854276180267, + "learning_rate": 4.9505195447798124e-06, + "loss": 0.2652374505996704, + "num_tokens": 2175464.0, + "reward": 0.22499998658895493, + "reward_std": 0.4262783080339432, + "rewards/reward_financial_reasoning/mean": 0.22499998658895493, + "rewards/reward_financial_reasoning/std": 0.42627833783626556, + "step": 246, + "step_time": 17.72774281000602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 18.75, + "completions/mean_terminated_length": 18.75, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.11041852181656278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10518508404493332, + "kl": 3.172191321849823, + "learning_rate": 4.945571499257793e-06, + "loss": 0.12077988684177399, + "num_tokens": 2194132.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 248, + "step_time": 17.46086399950218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 11.5625, + "completions/mean_terminated_length": 11.5625, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.11130899376669635, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.09775284677743912, + "kl": 4.232122749090195, + "learning_rate": 4.940623453735775e-06, + "loss": 0.16927936673164368, + "num_tokens": 2210133.0, + "reward": 0.36250001937150955, + "reward_std": 0.14225983619689941, + "rewards/reward_financial_reasoning/mean": 0.36250001937150955, + "rewards/reward_financial_reasoning/std": 0.14225984364748, + "step": 250, + "step_time": 12.163132073997986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.625, + "completions/mean_terminated_length": 11.625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.11219946571682991, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.34086915850639343, + "kl": 2.9683316498994827, + "learning_rate": 4.935675408213756e-06, + "loss": 0.1359396129846573, + "num_tokens": 2226015.0, + "reward": 0.23750000447034836, + "reward_std": 0.3371334373950958, + "rewards/reward_financial_reasoning/mean": 0.23750000447034836, + "rewards/reward_financial_reasoning/std": 0.3371334373950958, + "step": 252, + "step_time": 13.12046697099504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 21.1875, + "completions/mean_terminated_length": 21.1875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.11308993766696349, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.16412538290023804, + "kl": 3.1274770498275757, + "learning_rate": 4.930727362691737e-06, + "loss": 0.05577349662780762, + "num_tokens": 2243394.0, + "reward": 0.04999999701976776, + "reward_std": 0.3033005967736244, + "rewards/reward_financial_reasoning/mean": 0.04999999701976776, + "rewards/reward_financial_reasoning/std": 0.3033006191253662, + "step": 254, + "step_time": 15.995422225503717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 12.375, + "completions/mean_terminated_length": 12.375, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.11398040961709706, + "frac_reward_zero_std": 0.25, + "grad_norm": 7.9987897872924805, + "kl": 4.347164452075958, + "learning_rate": 4.925779317169718e-06, + "loss": 0.1008201465010643, + "num_tokens": 2264448.0, + "reward": 9.313225746154785e-10, + "reward_std": 0.3648405969142914, + "rewards/reward_financial_reasoning/mean": 9.313225746154785e-10, + "rewards/reward_financial_reasoning/std": 0.3648405969142914, + "step": 256, + "step_time": 15.81367241350381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 15.0, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.11487088156723063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1827375590801239, + "kl": 3.112269103527069, + "learning_rate": 4.9208312716477e-06, + "loss": 0.11426741629838943, + "num_tokens": 2281640.0, + "reward": 0.45000001788139343, + "reward_std": 0.37416574358940125, + "rewards/reward_financial_reasoning/mean": 0.45000001788139343, + "rewards/reward_financial_reasoning/std": 0.37416577339172363, + "step": 258, + "step_time": 15.266290853498504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 21.5, + "completions/mean_terminated_length": 21.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.1157613535173642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06703684478998184, + "kl": 2.67875400185585, + "learning_rate": 4.915883226125681e-06, + "loss": 0.10653968900442123, + "num_tokens": 2302760.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 260, + "step_time": 19.27089970898669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.0, + "completions/max_terminated_length": 42.0, + "completions/mean_length": 34.0625, + "completions/mean_terminated_length": 34.0625, + "completions/min_length": 26.5, + "completions/min_terminated_length": 26.5, + "epoch": 0.11665182546749778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5481160283088684, + "kl": 2.3839191049337387, + "learning_rate": 4.910935180603662e-06, + "loss": 0.092856764793396, + "num_tokens": 2320257.0, + "reward": -0.10000000521540642, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": -0.10000000521540642, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 262, + "step_time": 20.353683264493156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.5, + "completions/max_terminated_length": 12.5, + "completions/mean_length": 10.875, + "completions/mean_terminated_length": 10.875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.11754229741763135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27290061116218567, + "kl": 2.046361565589905, + "learning_rate": 4.905987135081643e-06, + "loss": 0.08147154003381729, + "num_tokens": 2337095.0, + "reward": 0.1250000037252903, + "reward_std": 0.34743958711624146, + "rewards/reward_financial_reasoning/mean": 0.1250000037252903, + "rewards/reward_financial_reasoning/std": 0.34743958711624146, + "step": 264, + "step_time": 12.594754040492262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 142.5, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 31.625, + "completions/mean_terminated_length": 16.10714292526245, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.11843276936776491, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6876330375671387, + "kl": 4.0588459968566895, + "learning_rate": 4.9010390895596245e-06, + "loss": 0.24528250098228455, + "num_tokens": 2349969.0, + "reward": 0.30000001937150955, + "reward_std": 0.2920685186982155, + "rewards/reward_financial_reasoning/mean": 0.30000001937150955, + "rewards/reward_financial_reasoning/std": 0.2920685261487961, + "step": 266, + "step_time": 43.22905554099998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.5, + "completions/max_terminated_length": 15.5, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.11932324131789848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10317867994308472, + "kl": 3.0537564754486084, + "learning_rate": 4.8960910440376054e-06, + "loss": 0.12132774293422699, + "num_tokens": 2372401.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 268, + "step_time": 17.077831523005443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 9.9375, + "completions/mean_terminated_length": 9.9375, + "completions/min_length": 4.5, + "completions/min_terminated_length": 4.5, + "epoch": 0.12021371326803205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3369079530239105, + "kl": 5.676760792732239, + "learning_rate": 4.891142998515586e-06, + "loss": 0.20450036227703094, + "num_tokens": 2382808.0, + "reward": 0.3999999985098839, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": 0.3999999985098839, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 270, + "step_time": 23.16856124699916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 16.25, + "completions/mean_terminated_length": 16.25, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.12110418521816563, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8960656523704529, + "kl": 3.670739457011223, + "learning_rate": 4.886194952993568e-06, + "loss": 0.1782083660364151, + "num_tokens": 2395636.0, + "reward": 0.10000001639127731, + "reward_std": 0.38396354019641876, + "rewards/reward_financial_reasoning/mean": 0.10000001639127731, + "rewards/reward_financial_reasoning/std": 0.38396355509757996, + "step": 272, + "step_time": 13.109228974997677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.4375, + "completions/mean_terminated_length": 15.4375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1219946571682992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08199235051870346, + "kl": 2.68540820479393, + "learning_rate": 4.881246907471549e-06, + "loss": 0.10612474381923676, + "num_tokens": 2413979.0, + "reward": -3.725290298461914e-09, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 274, + "step_time": 15.015723587002867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.5, + "completions/max_terminated_length": 50.5, + "completions/mean_length": 32.0625, + "completions/mean_terminated_length": 32.0625, + "completions/min_length": 15.5, + "completions/min_terminated_length": 15.5, + "epoch": 0.12288512911843277, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7122228145599365, + "kl": 1.6769566163420677, + "learning_rate": 4.87629886194953e-06, + "loss": -0.005341395735740662, + "num_tokens": 2432404.0, + "reward": 0.15000000223517418, + "reward_std": 0.1963960975408554, + "rewards/reward_financial_reasoning/mean": 0.15000000223517418, + "rewards/reward_financial_reasoning/std": 0.1963961124420166, + "step": 276, + "step_time": 23.557659125493956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 41.5, + "completions/max_terminated_length": 41.5, + "completions/mean_length": 13.5, + "completions/mean_terminated_length": 13.5, + "completions/min_length": 4.5, + "completions/min_terminated_length": 4.5, + "epoch": 0.12377560106856635, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.11030659079551697, + "kl": 3.0595133006572723, + "learning_rate": 4.871350816427511e-06, + "loss": 0.08042588829994202, + "num_tokens": 2451300.0, + "reward": -0.08749999850988388, + "reward_std": 0.485219344496727, + "rewards/reward_financial_reasoning/mean": -0.08749999850988388, + "rewards/reward_financial_reasoning/std": 0.485219344496727, + "step": 278, + "step_time": 21.63308912049979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 103.0, + "completions/max_terminated_length": 103.0, + "completions/mean_length": 26.125, + "completions/mean_terminated_length": 26.125, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.1246660730186999, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.08378394693136215, + "kl": 2.6196550726890564, + "learning_rate": 4.866402770905493e-06, + "loss": 0.21984213590621948, + "num_tokens": 2475614.0, + "reward": 0.38750001788139343, + "reward_std": 0.39018382132053375, + "rewards/reward_financial_reasoning/mean": 0.38750001788139343, + "rewards/reward_financial_reasoning/std": 0.39018386602401733, + "step": 280, + "step_time": 41.759238430495316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 144.0, + "completions/max_terminated_length": 19.5, + "completions/mean_length": 32.625, + "completions/mean_terminated_length": 17.00000023841858, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.12555654496883348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7665231823921204, + "kl": 2.6322261691093445, + "learning_rate": 4.861454725383474e-06, + "loss": 0.0889662653207779, + "num_tokens": 2491152.0, + "reward": 0.2500000037252903, + "reward_std": 0.32071349024772644, + "rewards/reward_financial_reasoning/mean": 0.2500000037252903, + "rewards/reward_financial_reasoning/std": 0.32071349024772644, + "step": 282, + "step_time": 45.58885034900595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 15.4375, + "completions/mean_terminated_length": 15.4375, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.12644701691896706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0762035995721817, + "kl": 4.372367188334465, + "learning_rate": 4.8565066798614556e-06, + "loss": 0.17135773599147797, + "num_tokens": 2502879.0, + "reward": 0.6749999970197678, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": 0.6749999970197678, + "rewards/reward_financial_reasoning/std": 0.24053513258695602, + "step": 284, + "step_time": 15.359563550005987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.5, + "completions/max_terminated_length": 10.5, + "completions/mean_length": 7.375, + "completions/mean_terminated_length": 7.375, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.12733748886910062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6486235857009888, + "kl": 4.935705095529556, + "learning_rate": 4.851558634339436e-06, + "loss": 0.1994798481464386, + "num_tokens": 2516885.0, + "reward": -0.125, + "reward_std": 0.40089187026023865, + "rewards/reward_financial_reasoning/mean": -0.125, + "rewards/reward_financial_reasoning/std": 0.40089187026023865, + "step": 286, + "step_time": 10.359898515005625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.5, + "completions/max_terminated_length": 39.5, + "completions/mean_length": 25.125, + "completions/mean_terminated_length": 25.125, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.12822796081923418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27073779702186584, + "kl": 3.24738085269928, + "learning_rate": 4.8466105888174175e-06, + "loss": 0.11370493471622467, + "num_tokens": 2532095.0, + "reward": 0.10000000149011612, + "reward_std": 0.37416573613882065, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.37416574358940125, + "step": 288, + "step_time": 28.717111858499266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.12911843276936777, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.002157688140869, + "kl": 3.048767626285553, + "learning_rate": 4.841662543295399e-06, + "loss": 0.11240536719560623, + "num_tokens": 2553159.0, + "reward": -0.04999999329447746, + "reward_std": 0.25354626774787903, + "rewards/reward_financial_reasoning/mean": -0.04999999329447746, + "rewards/reward_financial_reasoning/std": 0.2535462975502014, + "step": 290, + "step_time": 15.560649553492112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.5, + "completions/max_terminated_length": 27.5, + "completions/mean_length": 16.9375, + "completions/mean_terminated_length": 16.9375, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.13000890471950133, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.702836036682129, + "kl": 3.7099992632865906, + "learning_rate": 4.83671449777338e-06, + "loss": 0.12139731645584106, + "num_tokens": 2571374.0, + "reward": 0.4750000238418579, + "reward_std": 0.4641419053077698, + "rewards/reward_financial_reasoning/mean": 0.4750000238418579, + "rewards/reward_financial_reasoning/std": 0.46414193511009216, + "step": 292, + "step_time": 17.602761367503263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10.0, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 7.0625, + "completions/mean_terminated_length": 7.0625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.13089937666963491, + "frac_reward_zero_std": 1.0, + "grad_norm": 24.566186904907227, + "kl": 3.7748285830020905, + "learning_rate": 4.831766452251361e-06, + "loss": 0.1423129141330719, + "num_tokens": 2591519.0, + "reward": -0.125, + "reward_std": 0.40089187026023865, + "rewards/reward_financial_reasoning/mean": -0.125, + "rewards/reward_financial_reasoning/std": 0.40089187026023865, + "step": 294, + "step_time": 14.158596545497858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.0625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 141.5, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 31.0625, + "completions/mean_terminated_length": 15.830357551574707, + "completions/min_length": 4.5, + "completions/min_terminated_length": 4.5, + "epoch": 0.13178984861976847, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2435134649276733, + "kl": 4.378018736839294, + "learning_rate": 4.826818406729342e-06, + "loss": 0.15879350900650024, + "num_tokens": 2606584.0, + "reward": 0.16249998658895493, + "reward_std": 0.4624329060316086, + "rewards/reward_financial_reasoning/mean": 0.16249998658895493, + "rewards/reward_financial_reasoning/std": 0.4624328762292862, + "step": 296, + "step_time": 44.203227273505036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 17.625, + "completions/mean_terminated_length": 17.625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.13268032056990206, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1087723895907402, + "kl": 2.2376338690519333, + "learning_rate": 4.821870361207324e-06, + "loss": -0.017229370772838593, + "num_tokens": 2623290.0, + "reward": 0.21250000968575478, + "reward_std": 0.20310094952583313, + "rewards/reward_financial_reasoning/mean": 0.21250000968575478, + "rewards/reward_financial_reasoning/std": 0.20310097932815552, + "step": 298, + "step_time": 15.0541494709978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 13.625, + "completions/mean_terminated_length": 13.625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.13357079252003562, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38564178347587585, + "kl": 3.935918390750885, + "learning_rate": 4.816922315685305e-06, + "loss": 0.13589729368686676, + "num_tokens": 2642180.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 300, + "step_time": 16.082533718003106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 136.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 29.125, + "completions/mean_terminated_length": 14.178571701049805, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.13446126447016918, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.15317149460315704, + "kl": 2.9735162407159805, + "learning_rate": 4.811974270163286e-06, + "loss": 0.20121431350708008, + "num_tokens": 2659278.0, + "reward": 0.38750001788139343, + "reward_std": 0.39018382132053375, + "rewards/reward_financial_reasoning/mean": 0.38750001788139343, + "rewards/reward_financial_reasoning/std": 0.39018386602401733, + "step": 302, + "step_time": 45.91662111900223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 20.0625, + "completions/mean_terminated_length": 20.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.13535173642030277, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.18480771780014038, + "kl": 2.6496730744838715, + "learning_rate": 4.807026224641267e-06, + "loss": 0.13482548296451569, + "num_tokens": 2677199.0, + "reward": 0.11250000074505806, + "reward_std": 0.27998724579811096, + "rewards/reward_financial_reasoning/mean": 0.11250000074505806, + "rewards/reward_financial_reasoning/std": 0.27998724579811096, + "step": 304, + "step_time": 18.568712541000423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 13.5625, + "completions/mean_terminated_length": 13.5625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.13624220837043632, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.08845102041959763, + "kl": 3.689025968313217, + "learning_rate": 4.8020781791192486e-06, + "loss": 0.16846482455730438, + "num_tokens": 2697064.0, + "reward": 0.012500002980232239, + "reward_std": 0.5139711201190948, + "rewards/reward_financial_reasoning/mean": 0.012500002980232239, + "rewards/reward_financial_reasoning/std": 0.513971135020256, + "step": 306, + "step_time": 16.80304030750267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.1371326803205699, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.1612541675567627, + "kl": 2.348564103245735, + "learning_rate": 4.7971301335972295e-06, + "loss": 0.09607716649770737, + "num_tokens": 2714328.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 308, + "step_time": 15.943128220002109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.5, + "completions/max_terminated_length": 29.5, + "completions/mean_length": 18.4375, + "completions/mean_terminated_length": 18.4375, + "completions/min_length": 3.5, + "completions/min_terminated_length": 3.5, + "epoch": 0.13802315227070347, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.14036618173122406, + "kl": 4.25106380879879, + "learning_rate": 4.7921820880752105e-06, + "loss": 0.06849891692399979, + "num_tokens": 2729847.0, + "reward": 0.08750001154839993, + "reward_std": 0.33053525537252426, + "rewards/reward_financial_reasoning/mean": 0.08750001154839993, + "rewards/reward_financial_reasoning/std": 0.33053525537252426, + "step": 310, + "step_time": 16.476932208006474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.5, + "completions/max_terminated_length": 11.5, + "completions/mean_length": 8.5, + "completions/mean_terminated_length": 8.5, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.13891362422083706, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.081897735595703, + "kl": 2.9397522509098053, + "learning_rate": 4.787234042553192e-06, + "loss": 0.1418164074420929, + "num_tokens": 2745783.0, + "reward": -0.012499995529651642, + "reward_std": 0.5226411670446396, + "rewards/reward_financial_reasoning/mean": -0.012499995529651642, + "rewards/reward_financial_reasoning/std": 0.5226411670446396, + "step": 312, + "step_time": 12.007526194989623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.3125, + "completions/mean_terminated_length": 13.3125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.13980409617097062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1045272946357727, + "kl": 2.600666582584381, + "learning_rate": 4.782285997031173e-06, + "loss": 0.09802389144897461, + "num_tokens": 2761556.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 314, + "step_time": 12.98709403751127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 139.5, + "completions/max_terminated_length": 78.5, + "completions/mean_length": 38.0, + "completions/mean_terminated_length": 24.125, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.14069456812110417, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.935442328453064, + "kl": 4.464976981282234, + "learning_rate": 4.777337951509154e-06, + "loss": 0.09435869753360748, + "num_tokens": 2781652.0, + "reward": -0.08750000037252903, + "reward_std": 0.18624438345432281, + "rewards/reward_financial_reasoning/mean": -0.08750000037252903, + "rewards/reward_financial_reasoning/std": 0.1862443909049034, + "step": 316, + "step_time": 49.13465615150926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.8125, + "completions/clipped_ratio": 0.125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 41.8125, + "completions/mean_terminated_length": 11.214286088943481, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.14158504007123776, + "frac_reward_zero_std": 0.5, + "grad_norm": 6.196547985076904, + "kl": 3.587283805012703, + "learning_rate": 4.772389905987135e-06, + "loss": 0.14025059342384338, + "num_tokens": 2797705.0, + "reward": 0.050000011920928955, + "reward_std": 0.42308472096920013, + "rewards/reward_financial_reasoning/mean": 0.050000011920928955, + "rewards/reward_financial_reasoning/std": 0.4230847507715225, + "step": 318, + "step_time": 78.31156018701222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.75, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 136.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 24.75, + "completions/mean_terminated_length": 9.142857313156128, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.14247551202137132, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.3096694946289062, + "kl": 5.908802837133408, + "learning_rate": 4.767441860465117e-06, + "loss": 0.3304884731769562, + "num_tokens": 2815309.0, + "reward": 0.625, + "reward_std": 0.40620189905166626, + "rewards/reward_financial_reasoning/mean": 0.625, + "rewards/reward_financial_reasoning/std": 0.40620194375514984, + "step": 320, + "step_time": 46.993786041497515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.1433659839715049, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24115696549415588, + "kl": 4.057637929916382, + "learning_rate": 4.762493814943098e-06, + "loss": 0.14424464106559753, + "num_tokens": 2827581.0, + "reward": 0.09999999962747097, + "reward_std": 0.37416573613882065, + "rewards/reward_financial_reasoning/mean": 0.09999999962747097, + "rewards/reward_financial_reasoning/std": 0.37416573613882065, + "step": 322, + "step_time": 12.003390189493075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 12.0625, + "completions/mean_terminated_length": 12.0625, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.14425645592163847, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.445495843887329, + "kl": 4.4983391761779785, + "learning_rate": 4.757545769421079e-06, + "loss": 0.0963294506072998, + "num_tokens": 2849382.0, + "reward": 0.0625000149011612, + "reward_std": 0.28618598729372025, + "rewards/reward_financial_reasoning/mean": 0.0625000149011612, + "rewards/reward_financial_reasoning/std": 0.28618600964546204, + "step": 324, + "step_time": 18.020186542500596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 30.125, + "completions/mean_terminated_length": 30.125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.14514692787177205, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5247211456298828, + "kl": 2.5776854157447815, + "learning_rate": 4.75259772389906e-06, + "loss": 0.1263631284236908, + "num_tokens": 2870536.0, + "reward": 0.08750000223517418, + "reward_std": 0.1642080545425415, + "rewards/reward_financial_reasoning/mean": 0.08750000223517418, + "rewards/reward_financial_reasoning/std": 0.1642080694437027, + "step": 326, + "step_time": 41.201073117990745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.5, + "completions/max_terminated_length": 29.5, + "completions/mean_length": 15.3125, + "completions/mean_terminated_length": 15.3125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.1460373998219056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31851768493652344, + "kl": 4.2002551555633545, + "learning_rate": 4.7476496783770416e-06, + "loss": 0.1324121206998825, + "num_tokens": 2879789.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 328, + "step_time": 13.552819680997345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 20.875, + "completions/mean_terminated_length": 20.875, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.14692787177203917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08847746253013611, + "kl": 4.410182029008865, + "learning_rate": 4.7427016328550225e-06, + "loss": 0.1517459750175476, + "num_tokens": 2901283.0, + "reward": 0.32500000670552254, + "reward_std": 0.24053511023521423, + "rewards/reward_financial_reasoning/mean": 0.32500000670552254, + "rewards/reward_financial_reasoning/std": 0.24053512513637543, + "step": 330, + "step_time": 21.379759306495544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.1875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 133.5, + "completions/max_terminated_length": 10.0, + "completions/mean_length": 23.1875, + "completions/mean_terminated_length": 7.517857313156128, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.14781834372217276, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.072896003723145, + "kl": 5.56328746676445, + "learning_rate": 4.7377535873330035e-06, + "loss": 0.12063997983932495, + "num_tokens": 2914286.0, + "reward": 0.18750000558793545, + "reward_std": 0.3389529511332512, + "rewards/reward_financial_reasoning/mean": 0.18750000558793545, + "rewards/reward_financial_reasoning/std": 0.3389529809355736, + "step": 332, + "step_time": 43.318069253498834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.14870881567230632, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5086295008659363, + "kl": 2.4566811472177505, + "learning_rate": 4.732805541810985e-06, + "loss": 0.0956970602273941, + "num_tokens": 2933166.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 334, + "step_time": 16.838145778496255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.1875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 141.5, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 35.1875, + "completions/mean_terminated_length": 20.10714340209961, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.1495992876224399, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.059113502502441, + "kl": 3.152929961681366, + "learning_rate": 4.727857496288966e-06, + "loss": 0.1890815645456314, + "num_tokens": 2951673.0, + "reward": 0.11250000447034836, + "reward_std": 0.31000544875860214, + "rewards/reward_financial_reasoning/mean": 0.11250000447034836, + "rewards/reward_financial_reasoning/std": 0.3100054860115051, + "step": 336, + "step_time": 50.63826839199464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 12.4375, + "completions/mean_terminated_length": 12.4375, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.15048975957257346, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.7672529220581055, + "kl": 2.8252325542271137, + "learning_rate": 4.722909450766948e-06, + "loss": 0.07159600406885147, + "num_tokens": 2967208.0, + "reward": 0.16249998658895493, + "reward_std": 0.4624329060316086, + "rewards/reward_financial_reasoning/mean": 0.16249998658895493, + "rewards/reward_financial_reasoning/std": 0.4624328762292862, + "step": 338, + "step_time": 14.010528551505558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.5, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.15138023152270705, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.698003768920898, + "kl": 4.801441490650177, + "learning_rate": 4.717961405244928e-06, + "loss": 0.19205763936042786, + "num_tokens": 2983600.0, + "reward": -0.12500000186264515, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": -0.12500000186264515, + "rewards/reward_financial_reasoning/std": 0.24053511768579483, + "step": 340, + "step_time": 14.301874094995583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.1522707034728406, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16439253091812134, + "kl": 5.160941481590271, + "learning_rate": 4.71301335972291e-06, + "loss": 0.1990242302417755, + "num_tokens": 2999064.0, + "reward": -0.07500000298023224, + "reward_std": 0.45434411615133286, + "rewards/reward_financial_reasoning/mean": -0.07500000298023224, + "rewards/reward_financial_reasoning/std": 0.45434411615133286, + "step": 342, + "step_time": 13.344863591006288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 8.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 8.0, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 4.5, + "completions/min_terminated_length": 4.5, + "epoch": 0.15316117542297417, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.908713340759277, + "kl": 5.427981078624725, + "learning_rate": 4.708065314200891e-06, + "loss": 0.20119166374206543, + "num_tokens": 3010456.0, + "reward": 0.7125000208616257, + "reward_std": 0.22243821248412132, + "rewards/reward_financial_reasoning/mean": 0.7125000208616257, + "rewards/reward_financial_reasoning/std": 0.22243822365999222, + "step": 344, + "step_time": 10.042046501504956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 17.375, + "completions/mean_terminated_length": 17.375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.15405164737310775, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10223693400621414, + "kl": 3.115888088941574, + "learning_rate": 4.703117268678873e-06, + "loss": 0.12509028613567352, + "num_tokens": 3027478.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 346, + "step_time": 15.962319723501423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 21.5625, + "completions/mean_terminated_length": 21.5625, + "completions/min_length": 18.5, + "completions/min_terminated_length": 18.5, + "epoch": 0.1549421193232413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2353672981262207, + "kl": 2.3216539919376373, + "learning_rate": 4.698169223156854e-06, + "loss": 0.09396439045667648, + "num_tokens": 3046503.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 348, + "step_time": 17.60655723549644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 24.4375, + "completions/mean_terminated_length": 24.4375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1558325912733749, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0838085487484932, + "kl": 3.7829702496528625, + "learning_rate": 4.6932211776348345e-06, + "loss": 0.1546584516763687, + "num_tokens": 3064462.0, + "reward": 0.0, + "reward_std": 0.32293298840522766, + "rewards/reward_financial_reasoning/mean": 0.0, + "rewards/reward_financial_reasoning/std": 0.32293298840522766, + "step": 350, + "step_time": 19.438781550499698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 13.875, + "completions/mean_terminated_length": 13.875, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.15672306322350846, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.361366271972656, + "kl": 3.5697886049747467, + "learning_rate": 4.688273132112816e-06, + "loss": 0.024487487971782684, + "num_tokens": 3085988.0, + "reward": 0.16249999962747097, + "reward_std": 0.327665738761425, + "rewards/reward_financial_reasoning/mean": 0.16249999962747097, + "rewards/reward_financial_reasoning/std": 0.3276657685637474, + "step": 352, + "step_time": 16.988179097501416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.5, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 15.75, + "completions/mean_terminated_length": 15.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.15761353517364202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11495406925678253, + "kl": 2.6656560599803925, + "learning_rate": 4.683325086590797e-06, + "loss": 0.10649028420448303, + "num_tokens": 3108080.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 354, + "step_time": 17.6394682640057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.5, + "completions/max_terminated_length": 29.5, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.1585040071237756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16622397303581238, + "kl": 2.5759568214416504, + "learning_rate": 4.678377041068778e-06, + "loss": 0.104760080575943, + "num_tokens": 3123224.0, + "reward": 0.17500000074505806, + "reward_std": 0.29398736357688904, + "rewards/reward_financial_reasoning/mean": 0.17500000074505806, + "rewards/reward_financial_reasoning/std": 0.29398736357688904, + "step": 356, + "step_time": 16.088501062498835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.5, + "completions/max_terminated_length": 23.5, + "completions/mean_length": 15.4375, + "completions/mean_terminated_length": 15.4375, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.15939447907390916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4403320550918579, + "kl": 2.555238038301468, + "learning_rate": 4.673428995546759e-06, + "loss": 0.08659804612398148, + "num_tokens": 3145647.0, + "reward": 0.3750000149011612, + "reward_std": 0.45434410870075226, + "rewards/reward_financial_reasoning/mean": 0.3750000149011612, + "rewards/reward_financial_reasoning/std": 0.45434412360191345, + "step": 358, + "step_time": 19.564347899991844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 20.1875, + "completions/mean_terminated_length": 20.1875, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.16028495102404275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5063008069992065, + "kl": 2.5525257289409637, + "learning_rate": 4.668480950024741e-06, + "loss": 0.10185902565717697, + "num_tokens": 3166106.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 360, + "step_time": 17.824098634006077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.25, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 137.5, + "completions/max_terminated_length": 56.5, + "completions/mean_length": 36.25, + "completions/mean_terminated_length": 22.375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1611754229741763, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8783189058303833, + "kl": 3.123973160982132, + "learning_rate": 4.663532904502722e-06, + "loss": 0.15637540817260742, + "num_tokens": 3186542.0, + "reward": -0.0875000013038516, + "reward_std": 0.266422763466835, + "rewards/reward_financial_reasoning/mean": -0.0875000013038516, + "rewards/reward_financial_reasoning/std": 0.266422763466835, + "step": 362, + "step_time": 51.280559887498384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.5, + "completions/max_terminated_length": 15.5, + "completions/mean_length": 10.625, + "completions/mean_terminated_length": 10.625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.1620658949243099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6702510714530945, + "kl": 5.710592150688171, + "learning_rate": 4.658584858980703e-06, + "loss": 0.21924875676631927, + "num_tokens": 3203200.0, + "reward": -0.30000000447034836, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -0.30000000447034836, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 364, + "step_time": 13.775912964996678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.16295636687444345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2933957278728485, + "kl": 6.98069429397583, + "learning_rate": 4.653636813458684e-06, + "loss": 0.2750336825847626, + "num_tokens": 3223940.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 366, + "step_time": 17.495442778003053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.5, + "completions/max_terminated_length": 44.5, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 28.0, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.163846838824577, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2866394519805908, + "kl": 3.2483417838811874, + "learning_rate": 4.648688767936666e-06, + "loss": 0.11545059829950333, + "num_tokens": 3246244.0, + "reward": 0.22500000149011612, + "reward_std": 0.34743960946798325, + "rewards/reward_financial_reasoning/mean": 0.22500000149011612, + "rewards/reward_financial_reasoning/std": 0.34743963181972504, + "step": 368, + "step_time": 25.16599783250058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 12.75, + "completions/mean_terminated_length": 12.75, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.1647373107747106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18002893030643463, + "kl": 3.9817181825637817, + "learning_rate": 4.643740722414647e-06, + "loss": 0.1207280308008194, + "num_tokens": 3263880.0, + "reward": -0.22500000149011612, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": -0.22500000149011612, + "rewards/reward_financial_reasoning/std": 0.18708287179470062, + "step": 370, + "step_time": 15.068639872995846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.5, + "completions/max_terminated_length": 17.5, + "completions/mean_length": 13.0625, + "completions/mean_terminated_length": 13.0625, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.16562778272484416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23808419704437256, + "kl": 4.9848949164152145, + "learning_rate": 4.6387926768926275e-06, + "loss": 0.1741630584001541, + "num_tokens": 3276721.0, + "reward": 0.6749999970197678, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": 0.6749999970197678, + "rewards/reward_financial_reasoning/std": 0.24053513258695602, + "step": 372, + "step_time": 11.880487833997904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 22.3125, + "completions/mean_terminated_length": 22.3125, + "completions/min_length": 19.5, + "completions/min_terminated_length": 19.5, + "epoch": 0.16651825467497774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23066724836826324, + "kl": 2.4786416590213776, + "learning_rate": 4.633844631370609e-06, + "loss": 0.09922318905591965, + "num_tokens": 3297198.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 374, + "step_time": 18.64603684699614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 38.1875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 138.5, + "completions/max_terminated_length": 94.0, + "completions/mean_length": 38.1875, + "completions/mean_terminated_length": 24.455358505249023, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.1674087266251113, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3857172429561615, + "kl": 2.536154806613922, + "learning_rate": 4.62889658584859e-06, + "loss": 0.08426443487405777, + "num_tokens": 3326601.0, + "reward": -0.20000000298023224, + "reward_std": 0.32071349024772644, + "rewards/reward_financial_reasoning/mean": -0.20000000298023224, + "rewards/reward_financial_reasoning/std": 0.32071349024772644, + "step": 376, + "step_time": 58.15682624900728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.5, + "completions/max_terminated_length": 13.5, + "completions/mean_length": 11.4375, + "completions/mean_terminated_length": 11.4375, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.1682991985752449, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.25235873460769653, + "kl": 4.639036536216736, + "learning_rate": 4.623948540326571e-06, + "loss": 0.20123405754566193, + "num_tokens": 3338424.0, + "reward": 0.02500000037252903, + "reward_std": 0.2121320366859436, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.2121320366859436, + "step": 378, + "step_time": 10.856103686997812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 19.125, + "completions/mean_terminated_length": 19.125, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.16918967052537845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11810299009084702, + "kl": 2.776392698287964, + "learning_rate": 4.619000494804552e-06, + "loss": 0.10224653035402298, + "num_tokens": 3350586.0, + "reward": -0.22500000149011612, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": -0.22500000149011612, + "rewards/reward_financial_reasoning/std": 0.18708287179470062, + "step": 380, + "step_time": 14.638130175502738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.5, + "completions/max_terminated_length": 43.5, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.170080142475512, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.19874098896980286, + "kl": 2.42767196893692, + "learning_rate": 4.614052449282534e-06, + "loss": 0.05534215644001961, + "num_tokens": 3367010.0, + "reward": 0.2000000085681677, + "reward_std": 0.3259558826684952, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.32595589756965637, + "step": 382, + "step_time": 21.40752187149701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.5, + "completions/max_terminated_length": 25.5, + "completions/mean_length": 17.8125, + "completions/mean_terminated_length": 17.8125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.1709706144256456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36154139041900635, + "kl": 3.0273653864860535, + "learning_rate": 4.609104403760515e-06, + "loss": 0.11987186968326569, + "num_tokens": 3383775.0, + "reward": -0.22500000149011612, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": -0.22500000149011612, + "rewards/reward_financial_reasoning/std": 0.18708287179470062, + "step": 384, + "step_time": 16.326452354001958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 12.1875, + "completions/mean_terminated_length": 12.1875, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.17186108637577915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19802482426166534, + "kl": 3.4667557179927826, + "learning_rate": 4.604156358238496e-06, + "loss": 0.13101685047149658, + "num_tokens": 3404386.0, + "reward": -0.12500000186264515, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": -0.12500000186264515, + "rewards/reward_financial_reasoning/std": 0.24053511768579483, + "step": 386, + "step_time": 16.66828526950121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 16.75, + "completions/mean_terminated_length": 16.75, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.17275155832591274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17383332550525665, + "kl": 2.8899713158607483, + "learning_rate": 4.599208312716477e-06, + "loss": 0.10997433215379715, + "num_tokens": 3426190.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 388, + "step_time": 19.155468865996227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 53.1875, + "completions/clipped_ratio": 0.125, + "completions/max_length": 151.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 53.1875, + "completions/mean_terminated_length": 23.479166984558105, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.1736420302760463, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.8917670249938965, + "kl": 3.196235477924347, + "learning_rate": 4.594260267194459e-06, + "loss": 0.10975559055805206, + "num_tokens": 3448657.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 390, + "step_time": 55.84161307200702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 138.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 35.125, + "completions/mean_terminated_length": 20.500000953674316, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1745325022261799, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.013735055923462, + "kl": 3.2146300077438354, + "learning_rate": 4.5893122216724396e-06, + "loss": 0.24231016635894775, + "num_tokens": 3465963.0, + "reward": 0.02499999850988388, + "reward_std": 0.4057116433978081, + "rewards/reward_financial_reasoning/mean": 0.02499999850988388, + "rewards/reward_financial_reasoning/std": 0.40571165084838867, + "step": 392, + "step_time": 48.684688639499655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 132.5, + "completions/max_terminated_length": 16.5, + "completions/mean_length": 27.375, + "completions/mean_terminated_length": 12.4375, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.17542297417631345, + "frac_reward_zero_std": 1.0, + "grad_norm": 21.446544647216797, + "kl": 3.7770427837967873, + "learning_rate": 4.584364176150421e-06, + "loss": 0.14160798490047455, + "num_tokens": 3482145.0, + "reward": 0.12500000558793545, + "reward_std": 0.34743960946798325, + "rewards/reward_financial_reasoning/mean": 0.12500000558793545, + "rewards/reward_financial_reasoning/std": 0.34743960946798325, + "step": 394, + "step_time": 46.83634573549716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 14.125, + "completions/mean_terminated_length": 14.125, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.176313446126447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30426764488220215, + "kl": 2.9290217757225037, + "learning_rate": 4.579416130628402e-06, + "loss": 0.10413938015699387, + "num_tokens": 3495651.0, + "reward": 0.45000001788139343, + "reward_std": 0.37416574358940125, + "rewards/reward_financial_reasoning/mean": 0.45000001788139343, + "rewards/reward_financial_reasoning/std": 0.37416577339172363, + "step": 396, + "step_time": 13.700383353498182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 20.0625, + "completions/mean_terminated_length": 20.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.1772039180765806, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.12722289562225342, + "kl": 2.9362562894821167, + "learning_rate": 4.574468085106383e-06, + "loss": 0.10540154576301575, + "num_tokens": 3517420.0, + "reward": 0.06250000093132257, + "reward_std": 0.1060660183429718, + "rewards/reward_financial_reasoning/mean": 0.06250000093132257, + "rewards/reward_financial_reasoning/std": 0.1060660183429718, + "step": 398, + "step_time": 21.081551555500482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 78.625, + "completions/clipped_ratio": 0.25, + "completions/max_length": 143.5, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 78.625, + "completions/mean_terminated_length": 20.375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.17809439002671415, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.1222035884857178, + "kl": 4.669141083955765, + "learning_rate": 4.569520039584365e-06, + "loss": 0.18662548065185547, + "num_tokens": 3539174.0, + "reward": 0.08749999664723873, + "reward_std": 0.3001621440052986, + "rewards/reward_financial_reasoning/mean": 0.08749999664723873, + "rewards/reward_financial_reasoning/std": 0.3001621440052986, + "step": 400, + "step_time": 53.09678938449724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.5, + "completions/max_terminated_length": 19.5, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.17898486197684774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3273642659187317, + "kl": 3.055632010102272, + "learning_rate": 4.564571994062346e-06, + "loss": 0.12121474742889404, + "num_tokens": 3558926.0, + "reward": -0.15000000596046448, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": -0.15000000596046448, + "rewards/reward_financial_reasoning/std": 0.26726125180721283, + "step": 402, + "step_time": 16.527376137997635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 61.25, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 145.0, + "completions/max_terminated_length": 27.5, + "completions/mean_length": 61.25, + "completions/mean_terminated_length": 16.625, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.1798753339269813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6433550119400024, + "kl": 2.9147322699427605, + "learning_rate": 4.559623948540327e-06, + "loss": 0.09967145323753357, + "num_tokens": 3577754.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 404, + "step_time": 51.886354368994944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 7.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8.5, + "completions/max_terminated_length": 8.5, + "completions/mean_length": 7.5, + "completions/mean_terminated_length": 7.5, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.18076580587711488, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7596104741096497, + "kl": 5.5971426367759705, + "learning_rate": 4.554675903018308e-06, + "loss": 0.22060047090053558, + "num_tokens": 3591514.0, + "reward": 0.0, + "reward_std": 0.1963960975408554, + "rewards/reward_financial_reasoning/mean": 0.0, + "rewards/reward_financial_reasoning/std": 0.1963961124420166, + "step": 406, + "step_time": 10.784611634997418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 11.0, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.18165627782724844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2548048496246338, + "kl": 2.3614984899759293, + "learning_rate": 4.54972785749629e-06, + "loss": 0.09047777950763702, + "num_tokens": 3603050.0, + "reward": -0.20000000298023224, + "reward_std": 0.32071349024772644, + "rewards/reward_financial_reasoning/mean": -0.20000000298023224, + "rewards/reward_financial_reasoning/std": 0.32071349024772644, + "step": 408, + "step_time": 10.890331844006141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 48.8125, + "completions/clipped_ratio": 0.125, + "completions/max_length": 146.0, + "completions/max_terminated_length": 27.5, + "completions/mean_length": 48.8125, + "completions/mean_terminated_length": 19.08333396911621, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.182546749777382, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.14107643067836761, + "kl": 3.473247766494751, + "learning_rate": 4.544779811974271e-06, + "loss": 0.24801112711429596, + "num_tokens": 3625639.0, + "reward": 0.0, + "reward_std": 0.35675284266471863, + "rewards/reward_financial_reasoning/mean": 0.0, + "rewards/reward_financial_reasoning/std": 0.3567528575658798, + "step": 410, + "step_time": 54.626304682009504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 19.5, + "completions/min_terminated_length": 19.5, + "epoch": 0.1834372217275156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7728067636489868, + "kl": 2.5951511412858963, + "learning_rate": 4.539831766452252e-06, + "loss": 0.10369659960269928, + "num_tokens": 3649347.0, + "reward": -3.725290298461914e-09, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 412, + "step_time": 19.267440424002416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.5, + "completions/max_terminated_length": 26.5, + "completions/mean_length": 19.8125, + "completions/mean_terminated_length": 19.8125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.18432769367764915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2641366422176361, + "kl": 2.1459864526987076, + "learning_rate": 4.5348837209302326e-06, + "loss": 0.08479943871498108, + "num_tokens": 3661528.0, + "reward": 0.45000001788139343, + "reward_std": 0.37416574358940125, + "rewards/reward_financial_reasoning/mean": 0.45000001788139343, + "rewards/reward_financial_reasoning/std": 0.37416577339172363, + "step": 414, + "step_time": 13.814018774002761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.5, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 13.0625, + "completions/mean_terminated_length": 13.0625, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.18521816562778273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4945250153541565, + "kl": 2.9109133034944534, + "learning_rate": 4.529935675408214e-06, + "loss": 0.11534873396158218, + "num_tokens": 3683425.0, + "reward": 0.02499999850988388, + "reward_std": 0.45434408634901047, + "rewards/reward_financial_reasoning/mean": 0.02499999850988388, + "rewards/reward_financial_reasoning/std": 0.45434409379959106, + "step": 416, + "step_time": 17.516358232995117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 46.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 46.5625, + "completions/mean_terminated_length": 46.5625, + "completions/min_length": 26.5, + "completions/min_terminated_length": 26.5, + "epoch": 0.1861086375779163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4000868797302246, + "kl": 2.132822684943676, + "learning_rate": 4.524987629886195e-06, + "loss": 0.08503614366054535, + "num_tokens": 3699290.0, + "reward": 0.07499999552965164, + "reward_std": 0.40089186280965805, + "rewards/reward_financial_reasoning/mean": 0.07499999552965164, + "rewards/reward_financial_reasoning/std": 0.40089187026023865, + "step": 418, + "step_time": 25.094515656495787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.8125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 140.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 30.8125, + "completions/mean_terminated_length": 15.785714626312256, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.18699910952804988, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.17446395754814148, + "kl": 3.0325257033109665, + "learning_rate": 4.520039584364176e-06, + "loss": 0.2618591785430908, + "num_tokens": 3718951.0, + "reward": 0.2875000089406967, + "reward_std": 0.4670701175928116, + "rewards/reward_financial_reasoning/mean": 0.2875000089406967, + "rewards/reward_financial_reasoning/std": 0.4670701324939728, + "step": 420, + "step_time": 51.2821328200007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 9.3125, + "completions/mean_terminated_length": 9.3125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.18788958147818344, + "frac_reward_zero_std": 0.5, + "grad_norm": 8.17707347869873, + "kl": 7.205821305513382, + "learning_rate": 4.515091538842158e-06, + "loss": 0.450656533241272, + "num_tokens": 3734572.0, + "reward": 0.11250001192092896, + "reward_std": 0.4884578585624695, + "rewards/reward_financial_reasoning/mean": 0.11250001192092896, + "rewards/reward_financial_reasoning/std": 0.48845788836479187, + "step": 422, + "step_time": 16.285566200003814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 12.125, + "completions/mean_terminated_length": 12.125, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.188780053428317, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2911452353000641, + "kl": 4.378904566168785, + "learning_rate": 4.510143493320139e-06, + "loss": 0.14741505682468414, + "num_tokens": 3750030.0, + "reward": 0.20000000670552254, + "reward_std": 0.26726123690605164, + "rewards/reward_financial_reasoning/mean": 0.20000000670552254, + "rewards/reward_financial_reasoning/std": 0.26726123690605164, + "step": 424, + "step_time": 15.573336581994226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.5, + "completions/max_terminated_length": 16.5, + "completions/mean_length": 11.6875, + "completions/mean_terminated_length": 11.6875, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.18967052537845058, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.824861764907837, + "kl": 5.281881749629974, + "learning_rate": 4.50519544779812e-06, + "loss": 0.2081257700920105, + "num_tokens": 3764001.0, + "reward": 0.25, + "reward_std": 0.6948792338371277, + "rewards/reward_financial_reasoning/mean": 0.25, + "rewards/reward_financial_reasoning/std": 0.6948792338371277, + "step": 426, + "step_time": 13.06527343300695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12.5, + "completions/max_terminated_length": 12.5, + "completions/mean_length": 10.0625, + "completions/mean_terminated_length": 10.0625, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.19056099732858414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.71004319190979, + "kl": 3.232193171977997, + "learning_rate": 4.500247402276101e-06, + "loss": 0.12972982227802277, + "num_tokens": 3779722.0, + "reward": 0.1250000037252903, + "reward_std": 0.34743958711624146, + "rewards/reward_financial_reasoning/mean": 0.1250000037252903, + "rewards/reward_financial_reasoning/std": 0.34743958711624146, + "step": 428, + "step_time": 12.194943665497703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.5, + "completions/max_terminated_length": 30.5, + "completions/mean_length": 18.125, + "completions/mean_terminated_length": 18.125, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.19145146927871773, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1838688850402832, + "kl": 2.4279059320688248, + "learning_rate": 4.495299356754083e-06, + "loss": 0.0850219801068306, + "num_tokens": 3793124.0, + "reward": 0.4999999888241291, + "reward_std": 0.05345224589109421, + "rewards/reward_financial_reasoning/mean": 0.4999999888241291, + "rewards/reward_financial_reasoning/std": 0.05345224589109421, + "step": 430, + "step_time": 16.18135965249894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.5, + "completions/max_terminated_length": 23.5, + "completions/mean_length": 15.3125, + "completions/mean_terminated_length": 15.3125, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.1923419412288513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6881890892982483, + "kl": 3.365168124437332, + "learning_rate": 4.490351311232064e-06, + "loss": 0.1304081380367279, + "num_tokens": 3814681.0, + "reward": -0.125, + "reward_std": 0.29398736357688904, + "rewards/reward_financial_reasoning/mean": -0.125, + "rewards/reward_financial_reasoning/std": 0.29398736357688904, + "step": 432, + "step_time": 18.796456829495582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 20.3125, + "completions/mean_terminated_length": 20.3125, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.19323241317898487, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.092527389526367, + "kl": 2.5298050343990326, + "learning_rate": 4.485403265710045e-06, + "loss": 0.09655264765024185, + "num_tokens": 3831206.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 434, + "step_time": 16.56057541100381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 134.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 10.535714626312256, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.19412288512911843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.42697274684906, + "kl": 2.7040752321481705, + "learning_rate": 4.480455220188026e-06, + "loss": 0.14201709628105164, + "num_tokens": 3845242.0, + "reward": 0.23750001192092896, + "reward_std": 0.5119454711675644, + "rewards/reward_financial_reasoning/mean": 0.23750001192092896, + "rewards/reward_financial_reasoning/std": 0.5119454860687256, + "step": 436, + "step_time": 43.56783085800271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.5, + "completions/max_terminated_length": 33.5, + "completions/mean_length": 21.8125, + "completions/mean_terminated_length": 21.8125, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.195013357079252, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.33076462149620056, + "kl": 1.9392491281032562, + "learning_rate": 4.475507174666007e-06, + "loss": 0.06251571327447891, + "num_tokens": 3858575.0, + "reward": 0.1250000037252903, + "reward_std": 0.34743958711624146, + "rewards/reward_financial_reasoning/mean": 0.1250000037252903, + "rewards/reward_financial_reasoning/std": 0.34743958711624146, + "step": 438, + "step_time": 16.1721732709957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 64.6875, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 150.0, + "completions/max_terminated_length": 50.5, + "completions/mean_length": 64.6875, + "completions/mean_terminated_length": 21.225000381469727, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.19590382902938558, + "frac_reward_zero_std": 1.0, + "grad_norm": 971.8073120117188, + "kl": 15.128244251012802, + "learning_rate": 4.470559129143988e-06, + "loss": 1.0460975170135498, + "num_tokens": 3892042.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 440, + "step_time": 63.478058283501014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.5, + "completions/max_terminated_length": 25.5, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.19679430097951914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16787387430667877, + "kl": 4.265814155340195, + "learning_rate": 4.465611083621969e-06, + "loss": 0.149496391415596, + "num_tokens": 3907522.0, + "reward": 0.0, + "reward_std": 0.37416573613882065, + "rewards/reward_financial_reasoning/mean": 0.0, + "rewards/reward_financial_reasoning/std": 0.37416573613882065, + "step": 442, + "step_time": 16.013422278996586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 24.375, + "completions/mean_terminated_length": 24.375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.19768477292965272, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.1105194091796875, + "kl": 2.2890444099903107, + "learning_rate": 4.460663038099951e-06, + "loss": 0.10768984258174896, + "num_tokens": 3926016.0, + "reward": 1.4901161193847656e-08, + "reward_std": 0.27705904096364975, + "rewards/reward_financial_reasoning/mean": 1.4901161193847656e-08, + "rewards/reward_financial_reasoning/std": 0.27705905586481094, + "step": 444, + "step_time": 20.03814392700224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.5, + "completions/max_terminated_length": 48.5, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 23.5, + "completions/min_terminated_length": 23.5, + "epoch": 0.19857524487978628, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.598520278930664, + "kl": 2.6320609748363495, + "learning_rate": 4.455714992577932e-06, + "loss": 0.11514586955308914, + "num_tokens": 3946592.0, + "reward": 0.32500001043081284, + "reward_std": 0.38347896933555603, + "rewards/reward_financial_reasoning/mean": 0.32500001043081284, + "rewards/reward_financial_reasoning/std": 0.3834789991378784, + "step": 446, + "step_time": 25.14641531599409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 133.5, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 12.116071701049805, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.19946571682991987, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.15830379724502563, + "kl": 2.5864310264587402, + "learning_rate": 4.450766947055914e-06, + "loss": 0.17798259854316711, + "num_tokens": 3967716.0, + "reward": -0.0875000013038516, + "reward_std": 0.266422763466835, + "rewards/reward_financial_reasoning/mean": -0.0875000013038516, + "rewards/reward_financial_reasoning/std": 0.266422763466835, + "step": 448, + "step_time": 49.958261707997735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 43.3125, + "completions/clipped_ratio": 0.125, + "completions/max_length": 133.5, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 43.3125, + "completions/mean_terminated_length": 13.833333969116211, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.20035618878005343, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.354418754577637, + "kl": 3.437360942363739, + "learning_rate": 4.445818901533894e-06, + "loss": 0.10762486606836319, + "num_tokens": 3985121.0, + "reward": 0.25000000558793545, + "reward_std": 0.31163340061903, + "rewards/reward_financial_reasoning/mean": 0.25000000558793545, + "rewards/reward_financial_reasoning/std": 0.31163340061903, + "step": 450, + "step_time": 47.905381616998056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.4375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 138.5, + "completions/max_terminated_length": 26.5, + "completions/mean_length": 32.4375, + "completions/mean_terminated_length": 17.705357551574707, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.201246660730187, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.903286457061768, + "kl": 2.700325161218643, + "learning_rate": 4.440870856011876e-06, + "loss": 0.1732444018125534, + "num_tokens": 4005176.0, + "reward": 0.15000000223517418, + "reward_std": 0.1963960975408554, + "rewards/reward_financial_reasoning/mean": 0.15000000223517418, + "rewards/reward_financial_reasoning/std": 0.1963961124420166, + "step": 452, + "step_time": 49.246475757499866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.0, + "completions/max_terminated_length": 30.0, + "completions/mean_length": 22.9375, + "completions/mean_terminated_length": 22.9375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.20213713268032057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10897312313318253, + "kl": 2.5111473351716995, + "learning_rate": 4.435922810489857e-06, + "loss": 0.10052955150604248, + "num_tokens": 4027079.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 454, + "step_time": 20.917127861499466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.5, + "completions/max_terminated_length": 25.5, + "completions/mean_length": 18.0625, + "completions/mean_terminated_length": 18.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.20302760463045413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2660609483718872, + "kl": 3.365581303834915, + "learning_rate": 4.4309747649678384e-06, + "loss": 0.1130051389336586, + "num_tokens": 4044392.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 456, + "step_time": 17.129023396002594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 14.375, + "completions/mean_terminated_length": 14.375, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.20391807658058772, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14781758189201355, + "kl": 2.1308258026838303, + "learning_rate": 4.426026719445819e-06, + "loss": 0.08490869402885437, + "num_tokens": 4061966.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 458, + "step_time": 14.371157450499595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 26.125, + "completions/mean_terminated_length": 26.125, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.20480854853072128, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6753233671188354, + "kl": 6.325722843408585, + "learning_rate": 4.4210786739238e-06, + "loss": 0.24554677307605743, + "num_tokens": 4079328.0, + "reward": 0.04999999701976776, + "reward_std": 0.3033005967736244, + "rewards/reward_financial_reasoning/mean": 0.04999999701976776, + "rewards/reward_financial_reasoning/std": 0.3033006191253662, + "step": 460, + "step_time": 17.608918988491496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.4375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 138.5, + "completions/max_terminated_length": 27.5, + "completions/mean_length": 36.4375, + "completions/mean_terminated_length": 21.91964340209961, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.20569902048085487, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.20720401406288147, + "kl": 1.656369112432003, + "learning_rate": 4.416130628401782e-06, + "loss": 0.13556653261184692, + "num_tokens": 4099591.0, + "reward": -0.01249999925494194, + "reward_std": 0.12464234232902527, + "rewards/reward_financial_reasoning/mean": -0.01249999925494194, + "rewards/reward_financial_reasoning/std": 0.12464234232902527, + "step": 462, + "step_time": 50.19382210749973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.5, + "completions/max_terminated_length": 19.5, + "completions/mean_length": 13.125, + "completions/mean_terminated_length": 13.125, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.20658949243098843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4629965126514435, + "kl": 2.888270825147629, + "learning_rate": 4.411182582879763e-06, + "loss": 0.10317643731832504, + "num_tokens": 4114745.0, + "reward": -0.22500000149011612, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": -0.22500000149011612, + "rewards/reward_financial_reasoning/std": 0.18708287924528122, + "step": 464, + "step_time": 13.621162455001468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 46.6875, + "completions/clipped_ratio": 0.125, + "completions/max_length": 140.5, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 46.6875, + "completions/mean_terminated_length": 16.9375, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.20747996438112198, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.16763973236084, + "kl": 1.9613378066569567, + "learning_rate": 4.406234537357744e-06, + "loss": 0.14753587543964386, + "num_tokens": 4139204.0, + "reward": 0.11250001192092896, + "reward_std": 0.44064295291900635, + "rewards/reward_financial_reasoning/mean": 0.11250001192092896, + "rewards/reward_financial_reasoning/std": 0.44064295291900635, + "step": 466, + "step_time": 54.44558242650237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.20837043633125557, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4123859703540802, + "kl": 3.0889033526182175, + "learning_rate": 4.401286491835725e-06, + "loss": 0.04875720292329788, + "num_tokens": 4157948.0, + "reward": 0.20000001043081284, + "reward_std": 0.36730900406837463, + "rewards/reward_financial_reasoning/mean": 0.20000001043081284, + "rewards/reward_financial_reasoning/std": 0.3673090487718582, + "step": 468, + "step_time": 15.994729412002926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.5, + "completions/max_terminated_length": 31.5, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.20926090828138913, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7484197616577148, + "kl": 3.3789178878068924, + "learning_rate": 4.396338446313707e-06, + "loss": 0.12866923213005066, + "num_tokens": 4181444.0, + "reward": -0.07500000484287739, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": -0.07500000484287739, + "rewards/reward_financial_reasoning/std": 0.18708287924528122, + "step": 470, + "step_time": 22.79538112249793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 17.9375, + "completions/mean_terminated_length": 17.9375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.21015138023152272, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1577832698822021, + "kl": 2.395612269639969, + "learning_rate": 4.391390400791688e-06, + "loss": 0.09630811214447021, + "num_tokens": 4198235.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 472, + "step_time": 14.938532446500176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 44.5, + "completions/max_terminated_length": 44.5, + "completions/mean_length": 27.25, + "completions/mean_terminated_length": 27.25, + "completions/min_length": 17.5, + "completions/min_terminated_length": 17.5, + "epoch": 0.21104185218165628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05807355418801308, + "kl": 3.640577018260956, + "learning_rate": 4.386442355269669e-06, + "loss": 0.1443350613117218, + "num_tokens": 4211927.0, + "reward": -0.32500000298023224, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": -0.32500000298023224, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 474, + "step_time": 19.14156578449547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.5, + "completions/max_terminated_length": 33.5, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.21193232413178986, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.008650779724121, + "kl": 2.670556515455246, + "learning_rate": 4.38149430974765e-06, + "loss": 0.10395485162734985, + "num_tokens": 4231151.0, + "reward": -0.07500000298023224, + "reward_std": 0.23106741905212402, + "rewards/reward_financial_reasoning/mean": -0.07500000298023224, + "rewards/reward_financial_reasoning/std": 0.23106742650270462, + "step": 476, + "step_time": 20.392167506000987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 10.125, + "completions/mean_terminated_length": 10.125, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.21282279608192342, + "frac_reward_zero_std": 1.0, + "grad_norm": 16.985912322998047, + "kl": 7.489349842071533, + "learning_rate": 4.376546264225631e-06, + "loss": 0.2728106379508972, + "num_tokens": 4245401.0, + "reward": 0.12500000558793545, + "reward_std": 0.34743960946798325, + "rewards/reward_financial_reasoning/mean": 0.12500000558793545, + "rewards/reward_financial_reasoning/std": 0.34743960946798325, + "step": 478, + "step_time": 11.78860704699764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 21.125, + "completions/mean_terminated_length": 21.125, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.21371326803205698, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06662454456090927, + "kl": 3.3793341517448425, + "learning_rate": 4.371598218703612e-06, + "loss": 0.12479593604803085, + "num_tokens": 4265107.0, + "reward": 0.17500000447034836, + "reward_std": 0.29398737102746964, + "rewards/reward_financial_reasoning/mean": 0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.2939873933792114, + "step": 480, + "step_time": 21.45671570050399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.5, + "completions/max_terminated_length": 37.5, + "completions/mean_length": 19.1875, + "completions/mean_terminated_length": 19.1875, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.21460373998219057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21638722717761993, + "kl": 4.512425392866135, + "learning_rate": 4.366650173181593e-06, + "loss": 0.12742410600185394, + "num_tokens": 4284854.0, + "reward": 0.22499999776482582, + "reward_std": 0.34743958711624146, + "rewards/reward_financial_reasoning/mean": 0.22499999776482582, + "rewards/reward_financial_reasoning/std": 0.34743958711624146, + "step": 482, + "step_time": 21.60816939650249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 53.3125, + "completions/clipped_ratio": 0.125, + "completions/max_length": 149.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 53.3125, + "completions/mean_terminated_length": 24.479166984558105, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.21549421193232413, + "frac_reward_zero_std": 0.75, + "grad_norm": 15.088199615478516, + "kl": 2.1596968695521355, + "learning_rate": 4.361702127659575e-06, + "loss": 0.06029912829399109, + "num_tokens": 4299651.0, + "reward": 0.01250000111758709, + "reward_std": 0.24438642710447311, + "rewards/reward_financial_reasoning/mean": 0.01250000111758709, + "rewards/reward_financial_reasoning/std": 0.2443864420056343, + "step": 484, + "step_time": 49.9494952740024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.5, + "completions/max_terminated_length": 31.5, + "completions/mean_length": 25.1875, + "completions/mean_terminated_length": 25.1875, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.2163846838824577, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7525089979171753, + "kl": 2.5929179787635803, + "learning_rate": 4.356754082137556e-06, + "loss": 0.10190180689096451, + "num_tokens": 4317406.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 486, + "step_time": 18.708130840001104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 12.9375, + "completions/mean_terminated_length": 12.9375, + "completions/min_length": 4.5, + "completions/min_terminated_length": 4.5, + "epoch": 0.21727515583259127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8261609673500061, + "kl": 3.737182319164276, + "learning_rate": 4.351806036615537e-06, + "loss": 0.12806250154972076, + "num_tokens": 4339637.0, + "reward": -0.20000000298023224, + "reward_std": 0.32071349024772644, + "rewards/reward_financial_reasoning/mean": -0.20000000298023224, + "rewards/reward_financial_reasoning/std": 0.32071349024772644, + "step": 488, + "step_time": 19.100060859494988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.5, + "completions/max_terminated_length": 28.5, + "completions/mean_length": 22.5625, + "completions/mean_terminated_length": 22.5625, + "completions/min_length": 16.5, + "completions/min_terminated_length": 16.5, + "epoch": 0.21816562778272486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06733864545822144, + "kl": 3.426992893218994, + "learning_rate": 4.346857991093518e-06, + "loss": 0.1387546956539154, + "num_tokens": 4354870.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 490, + "step_time": 16.014215874009096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 18.4375, + "completions/mean_terminated_length": 18.4375, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.21905609973285842, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10745098441839218, + "kl": 12.205712288618088, + "learning_rate": 4.3419099455715e-06, + "loss": 0.3105950951576233, + "num_tokens": 4368573.0, + "reward": -0.12500000186264515, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": -0.12500000186264515, + "rewards/reward_financial_reasoning/std": 0.24053511768579483, + "step": 492, + "step_time": 13.937267904999317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 149.5, + "completions/max_terminated_length": 29.5, + "completions/mean_length": 35.0, + "completions/mean_terminated_length": 19.785714626312256, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.21994657168299198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07543536275625229, + "kl": 2.3355341032147408, + "learning_rate": 4.336961900049481e-06, + "loss": 0.0785098671913147, + "num_tokens": 4393805.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 494, + "step_time": 57.094905161506176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 15.6875, + "completions/mean_terminated_length": 15.6875, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.22083704363312556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15516425669193268, + "kl": 2.783940374851227, + "learning_rate": 4.332013854527462e-06, + "loss": 0.10796858370304108, + "num_tokens": 4410792.0, + "reward": -0.20000000298023224, + "reward_std": 0.32071349024772644, + "rewards/reward_financial_reasoning/mean": -0.20000000298023224, + "rewards/reward_financial_reasoning/std": 0.32071349024772644, + "step": 496, + "step_time": 17.537851961504202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.5, + "completions/max_terminated_length": 33.5, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.22172751558325912, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.727818965911865, + "kl": 3.4606142938137054, + "learning_rate": 4.327065809005443e-06, + "loss": 0.21134862303733826, + "num_tokens": 4424176.0, + "reward": 0.16250000894069672, + "reward_std": 0.24571321159601212, + "rewards/reward_financial_reasoning/mean": 0.16250000894069672, + "rewards/reward_financial_reasoning/std": 0.2457132264971733, + "step": 498, + "step_time": 17.342474356501043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.4375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 131.5, + "completions/max_terminated_length": 12.5, + "completions/mean_length": 25.4375, + "completions/mean_terminated_length": 10.285714626312256, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.2226179875333927, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4800151586532593, + "kl": 5.777998372912407, + "learning_rate": 4.322117763483424e-06, + "loss": 0.3806329369544983, + "num_tokens": 4437359.0, + "reward": -0.012499988079071045, + "reward_std": 0.28327932208776474, + "rewards/reward_financial_reasoning/mean": -0.012499988079071045, + "rewards/reward_financial_reasoning/std": 0.2832793518900871, + "step": 500, + "step_time": 42.75106621949817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.4375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 145.5, + "completions/max_terminated_length": 39.0, + "completions/mean_length": 42.4375, + "completions/mean_terminated_length": 28.46428680419922, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.22350845948352627, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.514166831970215, + "kl": 2.001025475561619, + "learning_rate": 4.317169717961406e-06, + "loss": 0.12764599919319153, + "num_tokens": 4455462.0, + "reward": 0.21249999850988388, + "reward_std": 0.3181980475783348, + "rewards/reward_financial_reasoning/mean": 0.21249999850988388, + "rewards/reward_financial_reasoning/std": 0.3181980773806572, + "step": 502, + "step_time": 52.044455729494075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11.5, + "completions/max_terminated_length": 11.5, + "completions/mean_length": 9.5, + "completions/mean_terminated_length": 9.5, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.22439893143365983, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.35060715675354, + "kl": 2.7600596249103546, + "learning_rate": 4.312221672439386e-06, + "loss": 0.1091075986623764, + "num_tokens": 4479406.0, + "reward": -0.20000000298023224, + "reward_std": 0.32071349024772644, + "rewards/reward_financial_reasoning/mean": -0.20000000298023224, + "rewards/reward_financial_reasoning/std": 0.32071349024772644, + "step": 504, + "step_time": 17.23378602399316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.5, + "completions/max_terminated_length": 58.5, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.2252894033837934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17364293336868286, + "kl": 2.8194206953048706, + "learning_rate": 4.307273626917368e-06, + "loss": 0.10312893986701965, + "num_tokens": 4497138.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 506, + "step_time": 25.97717865550294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 142.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 32.875, + "completions/mean_terminated_length": 17.535714626312256, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.22617987533392697, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.222346782684326, + "kl": 3.0017409920692444, + "learning_rate": 4.302325581395349e-06, + "loss": 0.2027941793203354, + "num_tokens": 4521648.0, + "reward": 0.11250000447034836, + "reward_std": 0.31000544875860214, + "rewards/reward_financial_reasoning/mean": 0.11250000447034836, + "rewards/reward_financial_reasoning/std": 0.3100054860115051, + "step": 508, + "step_time": 53.93680636600038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.5, + "completions/max_terminated_length": 28.5, + "completions/mean_length": 17.8125, + "completions/mean_terminated_length": 17.8125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.22707034728406056, + "frac_reward_zero_std": 1.0, + "grad_norm": 2410.09130859375, + "kl": 113.49399599432945, + "learning_rate": 4.297377535873331e-06, + "loss": 3.160386323928833, + "num_tokens": 4534861.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 510, + "step_time": 14.666269088997069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.5, + "completions/max_terminated_length": 27.5, + "completions/mean_length": 21.6875, + "completions/mean_terminated_length": 21.6875, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.22796081923419412, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.0670084953308105, + "kl": 3.2094925343990326, + "learning_rate": 4.292429490351312e-06, + "loss": 0.1282799392938614, + "num_tokens": 4552176.0, + "reward": -0.04999999701976776, + "reward_std": 0.37416571378707886, + "rewards/reward_financial_reasoning/mean": -0.04999999701976776, + "rewards/reward_financial_reasoning/std": 0.37416571378707886, + "step": 512, + "step_time": 17.098706054504873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.0, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.2288512911843277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08085216581821442, + "kl": 2.5412717759609222, + "learning_rate": 4.287481444829293e-06, + "loss": 0.10157681256532669, + "num_tokens": 4559796.0, + "reward": 0.5750000029802322, + "reward_std": 0.34743958711624146, + "rewards/reward_financial_reasoning/mean": 0.5750000029802322, + "rewards/reward_financial_reasoning/std": 0.34743958711624146, + "step": 514, + "step_time": 10.108604683504382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15.5, + "completions/max_terminated_length": 15.5, + "completions/mean_length": 11.1875, + "completions/mean_terminated_length": 11.1875, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.22974176313446126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29972851276397705, + "kl": 2.5984874963760376, + "learning_rate": 4.282533399307274e-06, + "loss": 0.10182797163724899, + "num_tokens": 4579279.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 516, + "step_time": 15.588283419001527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.5, + "completions/max_terminated_length": 21.5, + "completions/mean_length": 20.375, + "completions/mean_terminated_length": 20.375, + "completions/min_length": 18.5, + "completions/min_terminated_length": 18.5, + "epoch": 0.23063223508459482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12788325548171997, + "kl": 2.4726991653442383, + "learning_rate": 4.2775853537852555e-06, + "loss": 0.09916701912879944, + "num_tokens": 4596741.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 518, + "step_time": 15.570501908492588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 10.8125, + "completions/mean_terminated_length": 10.8125, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.2315227070347284, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.225735902786255, + "kl": 4.514407992362976, + "learning_rate": 4.2726373082632364e-06, + "loss": 0.17293544113636017, + "num_tokens": 4616618.0, + "reward": -0.19999999925494194, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": -0.19999999925494194, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 520, + "step_time": 15.201164821501152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 136.0, + "completions/max_terminated_length": 21.5, + "completions/mean_length": 29.375, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.23241317898486197, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.38097259402275085, + "kl": 2.0567711293697357, + "learning_rate": 4.267689262741217e-06, + "loss": 0.21855676174163818, + "num_tokens": 4637952.0, + "reward": 0.07500001043081284, + "reward_std": 0.3823606073856354, + "rewards/reward_financial_reasoning/mean": 0.07500001043081284, + "rewards/reward_financial_reasoning/std": 0.3823606073856354, + "step": 522, + "step_time": 51.707422207495256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.1875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 143.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 28.1875, + "completions/mean_terminated_length": 12.964285850524902, + "completions/min_length": 4.5, + "completions/min_terminated_length": 4.5, + "epoch": 0.23330365093499555, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.2591474056243896, + "kl": 2.7854665964841843, + "learning_rate": 4.262741217219199e-06, + "loss": 0.2408628761768341, + "num_tokens": 4655307.0, + "reward": 0.0625000074505806, + "reward_std": 0.36345769464969635, + "rewards/reward_financial_reasoning/mean": 0.0625000074505806, + "rewards/reward_financial_reasoning/std": 0.36345772445201874, + "step": 524, + "step_time": 51.00816127199505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 14.125, + "completions/mean_terminated_length": 14.125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.2341941228851291, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11695566028356552, + "kl": 2.5478688031435013, + "learning_rate": 4.25779317169718e-06, + "loss": 0.09943416714668274, + "num_tokens": 4669117.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 526, + "step_time": 11.958828082002583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.5, + "completions/max_terminated_length": 23.5, + "completions/mean_length": 16.5625, + "completions/mean_terminated_length": 16.5625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.2350845948352627, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1739898920059204, + "kl": 2.163258582353592, + "learning_rate": 4.252845126175161e-06, + "loss": 0.07099819928407669, + "num_tokens": 4691342.0, + "reward": 0.012499995529651642, + "reward_std": 0.46944527328014374, + "rewards/reward_financial_reasoning/mean": 0.012499995529651642, + "rewards/reward_financial_reasoning/std": 0.46944527328014374, + "step": 528, + "step_time": 19.66749314849585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.23597506678539626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1463579386472702, + "kl": 3.313349887728691, + "learning_rate": 4.247897080653142e-06, + "loss": 0.11747953295707703, + "num_tokens": 4713458.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 530, + "step_time": 19.892412299501302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.5, + "completions/max_terminated_length": 36.5, + "completions/mean_length": 22.625, + "completions/mean_terminated_length": 22.625, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.23686553873552982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16379092633724213, + "kl": 2.420012056827545, + "learning_rate": 4.242949035131124e-06, + "loss": 0.0907297134399414, + "num_tokens": 4735012.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 532, + "step_time": 22.5668212334931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.1875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 136.0, + "completions/max_terminated_length": 33.5, + "completions/mean_length": 33.1875, + "completions/mean_terminated_length": 18.85714340209961, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.2377560106856634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08127009868621826, + "kl": 2.7376275807619095, + "learning_rate": 4.238000989609105e-06, + "loss": 0.0901961699128151, + "num_tokens": 4754439.0, + "reward": -0.12500000186264515, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": -0.12500000186264515, + "rewards/reward_financial_reasoning/std": 0.24053511768579483, + "step": 534, + "step_time": 49.09290904900263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 120.5, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 120.5, + "completions/mean_terminated_length": 15.300000190734863, + "completions/min_length": 14.5, + "completions/min_terminated_length": 14.5, + "epoch": 0.23864648263579696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09383174031972885, + "kl": 1.5632264073938131, + "learning_rate": 4.233052944087086e-06, + "loss": 0.0377073734998703, + "num_tokens": 4783719.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 536, + "step_time": 91.08600207899872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.5, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 11.75, + "completions/mean_terminated_length": 11.75, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.23953695458593055, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38002869486808777, + "kl": 3.1995151340961456, + "learning_rate": 4.228104898565067e-06, + "loss": 0.12680239975452423, + "num_tokens": 4806131.0, + "reward": -0.04999999701976776, + "reward_std": 0.37416571378707886, + "rewards/reward_financial_reasoning/mean": -0.04999999701976776, + "rewards/reward_financial_reasoning/std": 0.37416571378707886, + "step": 538, + "step_time": 18.143814373001078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 20.4375, + "completions/mean_terminated_length": 20.4375, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.2404274265360641, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13875405490398407, + "kl": 2.7571463584899902, + "learning_rate": 4.2231568530430485e-06, + "loss": 0.11056126654148102, + "num_tokens": 4820570.0, + "reward": 0.17500000447034836, + "reward_std": 0.29398737102746964, + "rewards/reward_financial_reasoning/mean": 0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.2939873933792114, + "step": 540, + "step_time": 15.814246591500705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 22.25, + "completions/mean_terminated_length": 22.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.2413178984861977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36984992027282715, + "kl": 1.9395490437746048, + "learning_rate": 4.2182088075210294e-06, + "loss": 0.08099240064620972, + "num_tokens": 4834806.0, + "reward": 0.17500000074505806, + "reward_std": 0.29398736357688904, + "rewards/reward_financial_reasoning/mean": 0.17500000074505806, + "rewards/reward_financial_reasoning/std": 0.29398736357688904, + "step": 542, + "step_time": 16.965279011004895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.5, + "completions/max_terminated_length": 19.5, + "completions/mean_length": 13.625, + "completions/mean_terminated_length": 13.625, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.24220837043633126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26365983486175537, + "kl": 3.2727459371089935, + "learning_rate": 4.21326076199901e-06, + "loss": 0.13695213198661804, + "num_tokens": 4851616.0, + "reward": 0.025000005960464478, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": 0.025000005960464478, + "rewards/reward_financial_reasoning/std": 0.24053513258695602, + "step": 544, + "step_time": 14.740081926498533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.9375, + "completions/mean_terminated_length": 11.9375, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.24309884238646481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7430917620658875, + "kl": 5.095589101314545, + "learning_rate": 4.208312716476992e-06, + "loss": 0.20536673069000244, + "num_tokens": 4876447.0, + "reward": -0.1999999973922968, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.1999999973922968, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 546, + "step_time": 19.51627654249387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 17.625, + "completions/mean_terminated_length": 17.625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.2439893143365984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12649628520011902, + "kl": 2.3244690746068954, + "learning_rate": 4.203364670954973e-06, + "loss": 0.0855722650885582, + "num_tokens": 4897945.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 548, + "step_time": 22.503613662498537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.25, + "completions/clipped_ratio": 0.125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 14.5, + "completions/mean_length": 42.25, + "completions/mean_terminated_length": 11.71428632736206, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.24487978628673196, + "frac_reward_zero_std": 0.5, + "grad_norm": 8.322593688964844, + "kl": 3.0940473526716232, + "learning_rate": 4.198416625432954e-06, + "loss": 0.2162281721830368, + "num_tokens": 4921997.0, + "reward": 0.17500000912696123, + "reward_std": 0.30916696786880493, + "rewards/reward_financial_reasoning/mean": 0.17500000912696123, + "rewards/reward_financial_reasoning/std": 0.3091669976711273, + "step": 550, + "step_time": 87.72647281950412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 169.5, + "completions/max_terminated_length": 51.0, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 20.767857551574707, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.24577025823686555, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.2092423439025879, + "kl": 2.4467161744832993, + "learning_rate": 4.193468579910935e-06, + "loss": 0.209614560008049, + "num_tokens": 4944489.0, + "reward": 0.21250000968575478, + "reward_std": 0.20310094952583313, + "rewards/reward_financial_reasoning/mean": 0.21250000968575478, + "rewards/reward_financial_reasoning/std": 0.20310097932815552, + "step": 552, + "step_time": 61.421522411004844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.5, + "completions/max_terminated_length": 19.5, + "completions/mean_length": 14.0, + "completions/mean_terminated_length": 14.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.2466607301869991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2507273256778717, + "kl": 2.4721511006355286, + "learning_rate": 4.188520534388917e-06, + "loss": 0.09320840239524841, + "num_tokens": 4962209.0, + "reward": 0.30000001192092896, + "reward_std": 0.5345224589109421, + "rewards/reward_financial_reasoning/mean": 0.30000001192092896, + "rewards/reward_financial_reasoning/std": 0.5345224738121033, + "step": 554, + "step_time": 15.359706958995957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.3125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 142.0, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 31.3125, + "completions/mean_terminated_length": 16.053571701049805, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.2475512021371327, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.212193012237549, + "kl": 3.4973824322223663, + "learning_rate": 4.183572488866898e-06, + "loss": 0.16043438017368317, + "num_tokens": 4975462.0, + "reward": 0.4374999888241291, + "reward_std": 0.20310094952583313, + "rewards/reward_financial_reasoning/mean": 0.4374999888241291, + "rewards/reward_financial_reasoning/std": 0.20310096442699432, + "step": 556, + "step_time": 45.51254690750284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.1875, + "completions/clipped_ratio": 0.125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 15.5, + "completions/mean_length": 41.1875, + "completions/mean_terminated_length": 10.500000476837158, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.24844167408726625, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3985747694969177, + "kl": 3.6526686996221542, + "learning_rate": 4.1786244433448796e-06, + "loss": 0.23863635957241058, + "num_tokens": 4997065.0, + "reward": -0.17499999701976776, + "reward_std": 0.32802625745534897, + "rewards/reward_financial_reasoning/mean": -0.17499999701976776, + "rewards/reward_financial_reasoning/std": 0.32802625745534897, + "step": 558, + "step_time": 84.31422862350155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 23.0, + "completions/mean_terminated_length": 23.0, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.2493321460373998, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17756827175617218, + "kl": 2.134700432419777, + "learning_rate": 4.17367639782286e-06, + "loss": 0.08119706809520721, + "num_tokens": 5019425.0, + "reward": 0.1250000037252903, + "reward_std": 0.34743958711624146, + "rewards/reward_financial_reasoning/mean": 0.1250000037252903, + "rewards/reward_financial_reasoning/std": 0.34743958711624146, + "step": 560, + "step_time": 21.187141643997165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 64.875, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 147.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 64.875, + "completions/mean_terminated_length": 19.050000190734863, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.25022261798753337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06384090334177017, + "kl": 33.90071538090706, + "learning_rate": 4.1687283523008415e-06, + "loss": 0.38022756576538086, + "num_tokens": 5042127.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 562, + "step_time": 54.73333459249625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 56.875, + "completions/clipped_ratio": 0.125, + "completions/max_length": 154.5, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 56.875, + "completions/mean_terminated_length": 26.166666984558105, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.25111308993766696, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.06289020925760269, + "kl": 2.710405856370926, + "learning_rate": 4.163780306778823e-06, + "loss": 0.245255246758461, + "num_tokens": 5058741.0, + "reward": 0.125, + "reward_std": 0.2314550280570984, + "rewards/reward_financial_reasoning/mean": 0.125, + "rewards/reward_financial_reasoning/std": 0.2314550280570984, + "step": 564, + "step_time": 51.34954524199202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.75, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 144.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 30.75, + "completions/mean_terminated_length": 15.437500476837158, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.25200356188780054, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.2839765548706055, + "kl": 3.1892134696245193, + "learning_rate": 4.158832261256804e-06, + "loss": 0.1574995219707489, + "num_tokens": 5073553.0, + "reward": 0.6375000327825546, + "reward_std": 0.22243820875883102, + "rewards/reward_financial_reasoning/mean": 0.6375000327825546, + "rewards/reward_financial_reasoning/std": 0.22243822365999222, + "step": 566, + "step_time": 46.588499311001215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 71.375, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 71.375, + "completions/mean_terminated_length": 29.416667938232422, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.25289403383793413, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.5852010250091553, + "kl": 1.5079333148896694, + "learning_rate": 4.153884215734785e-06, + "loss": -0.010437268763780594, + "num_tokens": 5094175.0, + "reward": -0.16250000149011612, + "reward_std": 0.3156214952468872, + "rewards/reward_financial_reasoning/mean": -0.16250000149011612, + "rewards/reward_financial_reasoning/std": 0.3156214952468872, + "step": 568, + "step_time": 82.4080735499956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.5, + "completions/max_terminated_length": 36.5, + "completions/mean_length": 16.9375, + "completions/mean_terminated_length": 16.9375, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.25378450578806766, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.807444095611572, + "kl": 3.8660908937454224, + "learning_rate": 4.148936170212766e-06, + "loss": 0.19987696409225464, + "num_tokens": 5113566.0, + "reward": -0.012499988079071045, + "reward_std": 0.28327932208776474, + "rewards/reward_financial_reasoning/mean": -0.012499988079071045, + "rewards/reward_financial_reasoning/std": 0.2832793518900871, + "step": 570, + "step_time": 21.036308422495495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.5, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 141.5, + "completions/max_terminated_length": 19.5, + "completions/mean_length": 26.5, + "completions/mean_terminated_length": 11.017857551574707, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.25467497773820125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8640673756599426, + "kl": 4.032097265124321, + "learning_rate": 4.143988124690748e-06, + "loss": 0.14955981075763702, + "num_tokens": 5130134.0, + "reward": 0.05000000447034836, + "reward_std": 0.42761798202991486, + "rewards/reward_financial_reasoning/mean": 0.05000000447034836, + "rewards/reward_financial_reasoning/std": 0.42761798202991486, + "step": 572, + "step_time": 47.026896394996584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.5, + "completions/max_terminated_length": 97.5, + "completions/mean_length": 32.25, + "completions/mean_terminated_length": 32.25, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.25556544968833483, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19692854583263397, + "kl": 2.4768466502428055, + "learning_rate": 4.139040079168729e-06, + "loss": 0.07914983481168747, + "num_tokens": 5147642.0, + "reward": 0.025000005960464478, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": 0.025000005960464478, + "rewards/reward_financial_reasoning/std": 0.24053513258695602, + "step": 574, + "step_time": 36.08788107399232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 30.125, + "completions/mean_terminated_length": 30.125, + "completions/min_length": 14.5, + "completions/min_terminated_length": 14.5, + "epoch": 0.25645592163846836, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0939751863479614, + "kl": 2.487314283847809, + "learning_rate": 4.13409203364671e-06, + "loss": 0.09564441442489624, + "num_tokens": 5163948.0, + "reward": 0.17500000074505806, + "reward_std": 0.29398736357688904, + "rewards/reward_financial_reasoning/mean": 0.17500000074505806, + "rewards/reward_financial_reasoning/std": 0.29398736357688904, + "step": 576, + "step_time": 20.96104118149742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 9.6875, + "completions/mean_terminated_length": 9.6875, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.25734639358860195, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4339823722839355, + "kl": 4.345266610383987, + "learning_rate": 4.129143988124691e-06, + "loss": 0.15623024106025696, + "num_tokens": 5178567.0, + "reward": 0.5249999910593033, + "reward_std": 0.40089183300733566, + "rewards/reward_financial_reasoning/mean": 0.5249999910593033, + "rewards/reward_financial_reasoning/std": 0.40089183300733566, + "step": 578, + "step_time": 13.346558070505125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 43.875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 153.0, + "completions/max_terminated_length": 48.5, + "completions/mean_length": 43.875, + "completions/mean_terminated_length": 29.750000953674316, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.25823686553873554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06241405010223389, + "kl": 2.072991468012333, + "learning_rate": 4.1241959426026726e-06, + "loss": 0.07811924070119858, + "num_tokens": 5197021.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 580, + "step_time": 50.97594972850857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 46.5, + "completions/clipped_ratio": 0.125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 46.5, + "completions/mean_terminated_length": 16.57142925262451, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.2591273374888691, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.204791784286499, + "kl": 3.2653029412031174, + "learning_rate": 4.1192478970806535e-06, + "loss": 0.18918476998806, + "num_tokens": 5216461.0, + "reward": 0.22500000894069672, + "reward_std": 0.4830881953239441, + "rewards/reward_financial_reasoning/mean": 0.22500000894069672, + "rewards/reward_financial_reasoning/std": 0.4830882251262665, + "step": 582, + "step_time": 83.53283959699183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.5, + "completions/max_terminated_length": 31.5, + "completions/mean_length": 22.6875, + "completions/mean_terminated_length": 22.6875, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.26001780943900266, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1705213189125061, + "kl": 2.3984710425138474, + "learning_rate": 4.1142998515586345e-06, + "loss": 0.10159868746995926, + "num_tokens": 5237608.0, + "reward": -0.03750000521540642, + "reward_std": 0.1505940556526184, + "rewards/reward_financial_reasoning/mean": -0.03750000521540642, + "rewards/reward_financial_reasoning/std": 0.1505940705537796, + "step": 584, + "step_time": 20.515003172498837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 68.6875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 134.0, + "completions/max_terminated_length": 8.5, + "completions/mean_length": 68.6875, + "completions/mean_terminated_length": 5.9375, + "completions/min_length": 4.5, + "completions/min_terminated_length": 4.5, + "epoch": 0.26090828138913624, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3467575311660767, + "kl": 4.0492381900548935, + "learning_rate": 4.109351806036616e-06, + "loss": 0.15455511212348938, + "num_tokens": 5252235.0, + "reward": 0.22500000894069672, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.22500000894069672, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 586, + "step_time": 42.47935264099942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.5, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 14.4375, + "completions/mean_terminated_length": 14.4375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.26179875333926983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4897940158843994, + "kl": 2.533974751830101, + "learning_rate": 4.104403760514597e-06, + "loss": 0.10408926755189896, + "num_tokens": 5272538.0, + "reward": 0.17500000447034836, + "reward_std": 0.29398737102746964, + "rewards/reward_financial_reasoning/mean": 0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.2939873933792114, + "step": 588, + "step_time": 16.557259472498117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 74.625, + "completions/clipped_ratio": 0.25, + "completions/max_length": 141.5, + "completions/max_terminated_length": 14.5, + "completions/mean_length": 74.625, + "completions/mean_terminated_length": 11.125, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.26268922528940336, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.31131821870803833, + "kl": 4.341089241206646, + "learning_rate": 4.099455714992578e-06, + "loss": 0.05962072312831879, + "num_tokens": 5287316.0, + "reward": 0.4374999888241291, + "reward_std": 0.20310094952583313, + "rewards/reward_financial_reasoning/mean": 0.4374999888241291, + "rewards/reward_financial_reasoning/std": 0.20310096442699432, + "step": 590, + "step_time": 44.81954843049607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 77.6875, + "completions/clipped_ratio": 0.25, + "completions/max_length": 146.5, + "completions/max_terminated_length": 21.5, + "completions/mean_length": 77.6875, + "completions/mean_terminated_length": 15.1875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.26357969723953695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08828865736722946, + "kl": 2.6844928599894047, + "learning_rate": 4.094507669470559e-06, + "loss": 0.08801240473985672, + "num_tokens": 5299063.0, + "reward": 0.3500000014901161, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": 0.3500000014901161, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 592, + "step_time": 44.574043756492756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 46.375, + "completions/clipped_ratio": 0.125, + "completions/max_length": 139.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 46.375, + "completions/mean_terminated_length": 16.58333396911621, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.26447016918967053, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9881147146224976, + "kl": 2.950136587023735, + "learning_rate": 4.089559623948541e-06, + "loss": 0.26590752601623535, + "num_tokens": 5319749.0, + "reward": 0.15000000223517418, + "reward_std": 0.1963960975408554, + "rewards/reward_financial_reasoning/mean": 0.15000000223517418, + "rewards/reward_financial_reasoning/std": 0.1963961124420166, + "step": 594, + "step_time": 49.72403268000926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 13.75, + "completions/mean_terminated_length": 13.75, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.2653606411398041, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.235046863555908, + "kl": 2.279493637382984, + "learning_rate": 4.084611578426522e-06, + "loss": 0.08282871544361115, + "num_tokens": 5335481.0, + "reward": 0.1250000037252903, + "reward_std": 0.5438356846570969, + "rewards/reward_financial_reasoning/mean": 0.1250000037252903, + "rewards/reward_financial_reasoning/std": 0.5438356846570969, + "step": 596, + "step_time": 14.50181246100692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 135.0, + "completions/max_terminated_length": 30.5, + "completions/mean_length": 37.375, + "completions/mean_terminated_length": 23.491071701049805, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.26625111308993765, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7327852249145508, + "kl": 2.17822552472353, + "learning_rate": 4.079663532904503e-06, + "loss": 0.14887002110481262, + "num_tokens": 5351039.0, + "reward": 0.0625000074505806, + "reward_std": 0.36345769464969635, + "rewards/reward_financial_reasoning/mean": 0.0625000074505806, + "rewards/reward_financial_reasoning/std": 0.36345772445201874, + "step": 598, + "step_time": 43.72957969250274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.5, + "completions/max_terminated_length": 35.5, + "completions/mean_length": 24.625, + "completions/mean_terminated_length": 24.625, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.26714158504007124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.892500102519989, + "kl": 2.6852394491434097, + "learning_rate": 4.074715487382484e-06, + "loss": 0.10773130506277084, + "num_tokens": 5371433.0, + "reward": 0.07499999552965164, + "reward_std": 0.40089186280965805, + "rewards/reward_financial_reasoning/mean": 0.07499999552965164, + "rewards/reward_financial_reasoning/std": 0.40089187026023865, + "step": 600, + "step_time": 21.074333743999887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 38.0, + "completions/max_terminated_length": 38.0, + "completions/mean_length": 15.1875, + "completions/mean_terminated_length": 15.1875, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.2680320569902048, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12110089510679245, + "kl": 5.3096261620521545, + "learning_rate": 4.0697674418604655e-06, + "loss": 0.17482957243919373, + "num_tokens": 5382116.0, + "reward": 0.424999987706542, + "reward_std": 0.13363061845302582, + "rewards/reward_financial_reasoning/mean": 0.424999987706542, + "rewards/reward_financial_reasoning/std": 0.13363061845302582, + "step": 602, + "step_time": 15.877465633999236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.5, + "completions/max_terminated_length": 21.5, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 18.5, + "completions/min_terminated_length": 18.5, + "epoch": 0.26892252894033836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11750569939613342, + "kl": 2.0825249701738358, + "learning_rate": 4.0648193963384465e-06, + "loss": 0.0821978822350502, + "num_tokens": 5400944.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 604, + "step_time": 16.235258598000655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 16.5, + "completions/mean_terminated_length": 16.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.26981300089047194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.39543071389198303, + "kl": 2.9878444522619247, + "learning_rate": 4.0598713508164274e-06, + "loss": 0.12423422187566757, + "num_tokens": 5417336.0, + "reward": 0.17500000447034836, + "reward_std": 0.29398737102746964, + "rewards/reward_financial_reasoning/mean": 0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.2939873933792114, + "step": 606, + "step_time": 14.500998946492473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 45.625, + "completions/clipped_ratio": 0.125, + "completions/max_length": 138.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 45.625, + "completions/mean_terminated_length": 15.708333969116211, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.27070347284060553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09268730878829956, + "kl": 3.4857796132564545, + "learning_rate": 4.054923305294409e-06, + "loss": 0.0689154639840126, + "num_tokens": 5432242.0, + "reward": 0.4999999888241291, + "reward_std": 0.05345224589109421, + "rewards/reward_financial_reasoning/mean": 0.4999999888241291, + "rewards/reward_financial_reasoning/std": 0.05345224589109421, + "step": 608, + "step_time": 44.237581209497876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.1875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 158.5, + "completions/max_terminated_length": 42.5, + "completions/mean_length": 41.1875, + "completions/mean_terminated_length": 26.25892925262451, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.2715939447907391, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2548654079437256, + "kl": 1.9635280668735504, + "learning_rate": 4.04997525977239e-06, + "loss": 0.0742587149143219, + "num_tokens": 5449301.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 610, + "step_time": 53.4047240209984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.5, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 15.3125, + "completions/mean_terminated_length": 15.3125, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.27248441674087265, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.25754010677337646, + "kl": 2.283314034342766, + "learning_rate": 4.045027214250372e-06, + "loss": 0.10037024319171906, + "num_tokens": 5471570.0, + "reward": 0.2500000074505806, + "reward_std": 0.2577935457229614, + "rewards/reward_financial_reasoning/mean": 0.2500000074505806, + "rewards/reward_financial_reasoning/std": 0.2577935680747032, + "step": 612, + "step_time": 17.858794250994833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 28.0625, + "completions/mean_terminated_length": 28.0625, + "completions/min_length": 18.5, + "completions/min_terminated_length": 18.5, + "epoch": 0.27337488869100623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0666544958949089, + "kl": 2.3905524760484695, + "learning_rate": 4.040079168728352e-06, + "loss": 0.09394139051437378, + "num_tokens": 5489603.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 614, + "step_time": 23.732076187996427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.5, + "completions/max_terminated_length": 35.5, + "completions/mean_length": 22.0625, + "completions/mean_terminated_length": 22.0625, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.2742653606411398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4475157558917999, + "kl": 2.87551811337471, + "learning_rate": 4.035131123206334e-06, + "loss": 0.10836954414844513, + "num_tokens": 5506588.0, + "reward": 0.17500000074505806, + "reward_std": 0.29398736357688904, + "rewards/reward_financial_reasoning/mean": 0.17500000074505806, + "rewards/reward_financial_reasoning/std": 0.29398736357688904, + "step": 616, + "step_time": 18.957288730998698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 20.1875, + "completions/mean_terminated_length": 20.1875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.27515583259127335, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8842321634292603, + "kl": 2.9271684885025024, + "learning_rate": 4.030183077684315e-06, + "loss": 0.11716306954622269, + "num_tokens": 5526535.0, + "reward": 0.21250000968575478, + "reward_std": 0.20310094952583313, + "rewards/reward_financial_reasoning/mean": 0.21250000968575478, + "rewards/reward_financial_reasoning/std": 0.20310097932815552, + "step": 618, + "step_time": 17.811359782997897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.0, + "completions/max_terminated_length": 37.0, + "completions/mean_length": 33.5, + "completions/mean_terminated_length": 33.5, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.27604630454140694, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.110678791999817, + "kl": 2.3337113112211227, + "learning_rate": 4.025235032162297e-06, + "loss": 0.0931580662727356, + "num_tokens": 5551335.0, + "reward": -3.725290298461914e-09, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 620, + "step_time": 24.730726426994806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.9375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 135.5, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 23.9375, + "completions/mean_terminated_length": 8.321428775787354, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.2769367764915405, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3603140413761139, + "kl": 2.8234359323978424, + "learning_rate": 4.020286986640278e-06, + "loss": 0.06738406419754028, + "num_tokens": 5560406.0, + "reward": 0.4125000089406967, + "reward_std": 0.5242162793874741, + "rewards/reward_financial_reasoning/mean": 0.4125000089406967, + "rewards/reward_financial_reasoning/std": 0.5242162793874741, + "step": 622, + "step_time": 40.600682643504115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 138.0, + "completions/max_terminated_length": 16.5, + "completions/mean_length": 28.0, + "completions/mean_terminated_length": 12.678571701049805, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.2778272484416741, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.575019836425781, + "kl": 2.758111670613289, + "learning_rate": 4.0153389411182585e-06, + "loss": 0.10476522147655487, + "num_tokens": 5579686.0, + "reward": -0.0875000013038516, + "reward_std": 0.266422763466835, + "rewards/reward_financial_reasoning/mean": -0.0875000013038516, + "rewards/reward_financial_reasoning/std": 0.266422763466835, + "step": 624, + "step_time": 48.948475874003634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 14.375, + "completions/mean_terminated_length": 14.375, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.27871772039180764, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.24415989220142365, + "kl": 3.251735508441925, + "learning_rate": 4.01039089559624e-06, + "loss": 0.10590354353189468, + "num_tokens": 5600260.0, + "reward": 0.11250000447034836, + "reward_std": 0.511647641658783, + "rewards/reward_financial_reasoning/mean": 0.11250000447034836, + "rewards/reward_financial_reasoning/std": 0.5116476565599442, + "step": 626, + "step_time": 17.442232475510536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.75, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 137.5, + "completions/max_terminated_length": 21.5, + "completions/mean_length": 29.75, + "completions/mean_terminated_length": 14.464285850524902, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.27960819234194123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1346379518508911, + "kl": 2.713219091296196, + "learning_rate": 4.005442850074221e-06, + "loss": 0.08727573603391647, + "num_tokens": 5614192.0, + "reward": 0.4999999888241291, + "reward_std": 0.05345224589109421, + "rewards/reward_financial_reasoning/mean": 0.4999999888241291, + "rewards/reward_financial_reasoning/std": 0.05345224589109421, + "step": 628, + "step_time": 44.28616671049895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.5, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 132.5, + "completions/max_terminated_length": 45.0, + "completions/mean_length": 27.5, + "completions/mean_terminated_length": 12.625, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.2804986642920748, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4055415093898773, + "kl": 4.209835857152939, + "learning_rate": 4.000494804552202e-06, + "loss": 0.17613010108470917, + "num_tokens": 5626128.0, + "reward": 0.3999999910593033, + "reward_std": 0.6133611500263214, + "rewards/reward_financial_reasoning/mean": 0.3999999910593033, + "rewards/reward_financial_reasoning/std": 0.6133611798286438, + "step": 630, + "step_time": 41.569186088505376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.25, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 139.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 29.25, + "completions/mean_terminated_length": 14.196428775787354, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.28138913624220835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3633623719215393, + "kl": 2.2347910553216934, + "learning_rate": 3.995546759030183e-06, + "loss": 0.07843896746635437, + "num_tokens": 5644300.0, + "reward": 0.1250000074505806, + "reward_std": 0.34743961691856384, + "rewards/reward_financial_reasoning/mean": 0.1250000074505806, + "rewards/reward_financial_reasoning/std": 0.34743963181972504, + "step": 632, + "step_time": 49.13458822750181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.5, + "completions/max_terminated_length": 17.5, + "completions/mean_length": 12.8125, + "completions/mean_terminated_length": 12.8125, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.28227960819234194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4399714767932892, + "kl": 2.800263747572899, + "learning_rate": 3.990598713508165e-06, + "loss": 0.11733436584472656, + "num_tokens": 5663081.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 634, + "step_time": 15.632818516496627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 50.5, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 171.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 50.5, + "completions/mean_terminated_length": 36.125000953674316, + "completions/min_length": 20.5, + "completions/min_terminated_length": 20.5, + "epoch": 0.2831700801424755, + "frac_reward_zero_std": 0.5, + "grad_norm": 10.8825044631958, + "kl": 2.876441642642021, + "learning_rate": 3.985650667986146e-06, + "loss": 0.23618704080581665, + "num_tokens": 5688585.0, + "reward": 0.012499998323619366, + "reward_std": 0.19864802062511444, + "rewards/reward_financial_reasoning/mean": 0.012499998323619366, + "rewards/reward_financial_reasoning/std": 0.19864803552627563, + "step": 636, + "step_time": 62.42338538400145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 55.25, + "completions/clipped_ratio": 0.125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 57.5, + "completions/mean_length": 55.25, + "completions/mean_terminated_length": 26.57142972946167, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.2840605520926091, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.063951849937439, + "kl": 2.164351548999548, + "learning_rate": 3.980702622464127e-06, + "loss": 0.2528214454650879, + "num_tokens": 5705493.0, + "reward": 0.32500001788139343, + "reward_std": 0.40620189905166626, + "rewards/reward_financial_reasoning/mean": 0.32500001788139343, + "rewards/reward_financial_reasoning/std": 0.40620195865631104, + "step": 638, + "step_time": 79.79323933250271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 134.0, + "completions/max_terminated_length": 35.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 17.035715103149414, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.28495102404274264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08490260690450668, + "kl": 2.701087385416031, + "learning_rate": 3.975754576942108e-06, + "loss": 0.08981408923864365, + "num_tokens": 5724549.0, + "reward": -0.12500000186264515, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": -0.12500000186264515, + "rewards/reward_financial_reasoning/std": 0.24053511768579483, + "step": 640, + "step_time": 48.10033849150204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 138.5, + "completions/max_terminated_length": 40.5, + "completions/mean_length": 31.5625, + "completions/mean_terminated_length": 16.678571701049805, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.2858414959928762, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.19996166229248, + "kl": 3.914921998977661, + "learning_rate": 3.97080653142009e-06, + "loss": 0.10081885010004044, + "num_tokens": 5743766.0, + "reward": 0.38750001788139343, + "reward_std": 0.39018382132053375, + "rewards/reward_financial_reasoning/mean": 0.38750001788139343, + "rewards/reward_financial_reasoning/std": 0.39018386602401733, + "step": 642, + "step_time": 47.918799355502415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.5, + "completions/max_terminated_length": 37.5, + "completions/mean_length": 28.3125, + "completions/mean_terminated_length": 28.3125, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.2867319679430098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07037749886512756, + "kl": 2.5467402040958405, + "learning_rate": 3.9658584858980706e-06, + "loss": 0.10022091865539551, + "num_tokens": 5764507.0, + "reward": -0.10000000521540642, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": -0.10000000521540642, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 644, + "step_time": 21.87775604199851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.5, + "completions/max_terminated_length": 57.5, + "completions/mean_length": 22.3125, + "completions/mean_terminated_length": 22.3125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.28762243989314334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3838065266609192, + "kl": 34.147406324744225, + "learning_rate": 3.9609104403760515e-06, + "loss": 1.4865283966064453, + "num_tokens": 5780000.0, + "reward": -0.025000005960464478, + "reward_std": 0.29398736357688904, + "rewards/reward_financial_reasoning/mean": -0.025000005960464478, + "rewards/reward_financial_reasoning/std": 0.29398736357688904, + "step": 646, + "step_time": 23.692439714999637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.5, + "completions/max_terminated_length": 34.5, + "completions/mean_length": 13.5625, + "completions/mean_terminated_length": 13.5625, + "completions/min_length": 4.5, + "completions/min_terminated_length": 4.5, + "epoch": 0.28851291184327693, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.09616874158382416, + "kl": 4.008978515863419, + "learning_rate": 3.955962394854033e-06, + "loss": 0.13894228637218475, + "num_tokens": 5797961.0, + "reward": -0.14999999850988388, + "reward_std": 0.4457136541604996, + "rewards/reward_financial_reasoning/mean": -0.14999999850988388, + "rewards/reward_financial_reasoning/std": 0.4457136541604996, + "step": 648, + "step_time": 19.492217393992178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 33.5, + "completions/max_terminated_length": 33.5, + "completions/mean_length": 14.5, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.2894033837934105, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.10026784986257553, + "kl": 2.471968561410904, + "learning_rate": 3.951014349332014e-06, + "loss": 0.04348517209291458, + "num_tokens": 5812921.0, + "reward": -0.0624999962747097, + "reward_std": 0.219983771443367, + "rewards/reward_financial_reasoning/mean": -0.0624999962747097, + "rewards/reward_financial_reasoning/std": 0.219983771443367, + "step": 650, + "step_time": 17.18861940100396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 138.5, + "completions/max_terminated_length": 36.5, + "completions/mean_length": 34.125, + "completions/mean_terminated_length": 19.44642925262451, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.2902938557435441, + "frac_reward_zero_std": 0.5, + "grad_norm": 4.763707160949707, + "kl": 250.4700199663639, + "learning_rate": 3.946066303809995e-06, + "loss": 15.857925415039062, + "num_tokens": 5833435.0, + "reward": 0.17500000912696123, + "reward_std": 0.30916696786880493, + "rewards/reward_financial_reasoning/mean": 0.17500000912696123, + "rewards/reward_financial_reasoning/std": 0.3091669976711273, + "step": 652, + "step_time": 50.153412438994565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 60.4375, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 60.4375, + "completions/mean_terminated_length": 15.488095760345459, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.29118432769367764, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.0384180545806885, + "kl": 1.534380428493023, + "learning_rate": 3.941118258287976e-06, + "loss": 0.04604524374008179, + "num_tokens": 5850522.0, + "reward": 0.21250001154839993, + "reward_std": 0.3419739603996277, + "rewards/reward_financial_reasoning/mean": 0.21250001154839993, + "rewards/reward_financial_reasoning/std": 0.3419739902019501, + "step": 654, + "step_time": 79.99165332850316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.5, + "completions/max_terminated_length": 46.5, + "completions/mean_length": 22.5625, + "completions/mean_terminated_length": 22.5625, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.2920747996438112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6213010549545288, + "kl": 3.3993609100580215, + "learning_rate": 3.936170212765958e-06, + "loss": 0.12003728747367859, + "num_tokens": 5867427.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 656, + "step_time": 22.32193847149756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 28.0625, + "completions/mean_terminated_length": 28.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.2929652715939448, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.232942596077919, + "kl": 2.3799022883176804, + "learning_rate": 3.931222167243939e-06, + "loss": 0.0874049961566925, + "num_tokens": 5885148.0, + "reward": -3.725290298461914e-09, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 658, + "step_time": 23.394534400998964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 15.4375, + "completions/mean_terminated_length": 15.4375, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.29385574354407834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328476071357727, + "kl": 2.801008328795433, + "learning_rate": 3.92627412172192e-06, + "loss": 0.10386461019515991, + "num_tokens": 5904787.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 660, + "step_time": 16.91145214050266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 11.3125, + "completions/mean_terminated_length": 11.3125, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.2947462154942119, + "frac_reward_zero_std": 1.0, + "grad_norm": 7.384799957275391, + "kl": 4.985250949859619, + "learning_rate": 3.921326076199901e-06, + "loss": 0.18682938814163208, + "num_tokens": 5917672.0, + "reward": 0.6249999850988388, + "reward_std": 0.40089183300733566, + "rewards/reward_financial_reasoning/mean": 0.6249999850988388, + "rewards/reward_financial_reasoning/std": 0.40089183300733566, + "step": 662, + "step_time": 12.575487746493309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.0625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 149.0, + "completions/max_terminated_length": 31.5, + "completions/mean_length": 35.0625, + "completions/mean_terminated_length": 20.267857551574707, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.2956366874443455, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0087742805480957, + "kl": 2.6491604149341583, + "learning_rate": 3.916378030677883e-06, + "loss": 0.21765044331550598, + "num_tokens": 5940305.0, + "reward": -0.03750000428408384, + "reward_std": 0.21297051757574081, + "rewards/reward_financial_reasoning/mean": -0.03750000428408384, + "rewards/reward_financial_reasoning/std": 0.2129705250263214, + "step": 664, + "step_time": 54.70537975649859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.4375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 161.0, + "completions/max_terminated_length": 56.5, + "completions/mean_length": 40.4375, + "completions/mean_terminated_length": 25.803571701049805, + "completions/min_length": 16.5, + "completions/min_terminated_length": 16.5, + "epoch": 0.2965271593944791, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.06418345123529434, + "kl": 3.329156816005707, + "learning_rate": 3.911429985155864e-06, + "loss": 0.1880941092967987, + "num_tokens": 5957008.0, + "reward": 0.06250000093132257, + "reward_std": 0.1060660183429718, + "rewards/reward_financial_reasoning/mean": 0.06250000093132257, + "rewards/reward_financial_reasoning/std": 0.1060660183429718, + "step": 666, + "step_time": 53.807816513504804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.29741763134461263, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1388830840587616, + "kl": 2.4952100068330765, + "learning_rate": 3.9064819396338445e-06, + "loss": 0.09984306991100311, + "num_tokens": 5979428.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 668, + "step_time": 19.566764060000423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.5, + "completions/max_terminated_length": 18.5, + "completions/mean_length": 12.5, + "completions/mean_terminated_length": 12.5, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.2983081032947462, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17398419976234436, + "kl": 3.8046407103538513, + "learning_rate": 3.901533894111826e-06, + "loss": 0.14238014817237854, + "num_tokens": 5994700.0, + "reward": 0.02499999850988388, + "reward_std": 0.45434410870075226, + "rewards/reward_financial_reasoning/mean": 0.02499999850988388, + "rewards/reward_financial_reasoning/std": 0.45434410870075226, + "step": 670, + "step_time": 13.631318826504867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 39.9375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 162.5, + "completions/max_terminated_length": 46.5, + "completions/mean_length": 39.9375, + "completions/mean_terminated_length": 25.142857551574707, + "completions/min_length": 14.5, + "completions/min_terminated_length": 14.5, + "epoch": 0.2991985752448798, + "frac_reward_zero_std": 0.75, + "grad_norm": 20.34697723388672, + "kl": 7.88471856713295, + "learning_rate": 3.896585848589807e-06, + "loss": 0.20272746682167053, + "num_tokens": 6014331.0, + "reward": -1.862645149230957e-09, + "reward_std": 0.1508890464901924, + "rewards/reward_financial_reasoning/mean": -1.862645149230957e-09, + "rewards/reward_financial_reasoning/std": 0.150889053940773, + "step": 672, + "step_time": 57.11428980049823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 65.875, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 139.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 65.875, + "completions/mean_terminated_length": 25.225000381469727, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.30008904719501334, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6960481405258179, + "kl": 3.615657825022936, + "learning_rate": 3.891637803067789e-06, + "loss": 0.07131641358137131, + "num_tokens": 6026945.0, + "reward": 0.4624999910593033, + "reward_std": 0.6227896511554718, + "rewards/reward_financial_reasoning/mean": 0.4624999910593033, + "rewards/reward_financial_reasoning/std": 0.6227896213531494, + "step": 674, + "step_time": 43.584765961499215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.0, + "completions/max_terminated_length": 48.0, + "completions/mean_length": 25.75, + "completions/mean_terminated_length": 25.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.3009795191451469, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.05800027772784233, + "kl": 2.021700158715248, + "learning_rate": 3.88668975754577e-06, + "loss": 0.04579779505729675, + "num_tokens": 6042589.0, + "reward": 0.38750001788139343, + "reward_std": 0.39018382132053375, + "rewards/reward_financial_reasoning/mean": 0.38750001788139343, + "rewards/reward_financial_reasoning/std": 0.39018386602401733, + "step": 676, + "step_time": 21.839032033509284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 13.6875, + "completions/mean_terminated_length": 13.6875, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.3018699910952805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.451972097158432, + "kl": 2.8006647378206253, + "learning_rate": 3.881741712023751e-06, + "loss": 0.09243463724851608, + "num_tokens": 6059048.0, + "reward": 0.45000001788139343, + "reward_std": 0.37416574358940125, + "rewards/reward_financial_reasoning/mean": 0.45000001788139343, + "rewards/reward_financial_reasoning/std": 0.37416577339172363, + "step": 678, + "step_time": 15.941890390007757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.5, + "completions/max_terminated_length": 93.5, + "completions/mean_length": 22.125, + "completions/mean_terminated_length": 22.125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.3027604630454141, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.601978778839111, + "kl": 3.8233794271945953, + "learning_rate": 3.876793666501732e-06, + "loss": 0.1953509896993637, + "num_tokens": 6070754.0, + "reward": 0.08750000596046448, + "reward_std": 0.25599944591522217, + "rewards/reward_financial_reasoning/mean": 0.08750000596046448, + "rewards/reward_financial_reasoning/std": 0.25599944591522217, + "step": 680, + "step_time": 31.333621428992046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.5, + "completions/max_terminated_length": 40.5, + "completions/mean_length": 28.1875, + "completions/mean_terminated_length": 28.1875, + "completions/min_length": 20.5, + "completions/min_terminated_length": 20.5, + "epoch": 0.30365093499554763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2807849049568176, + "kl": 2.814168304204941, + "learning_rate": 3.871845620979714e-06, + "loss": 0.11746909469366074, + "num_tokens": 6088965.0, + "reward": 0.07500000298023224, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.07500000298023224, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 682, + "step_time": 21.245715715005645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 19.5, + "completions/max_terminated_length": 19.5, + "completions/mean_length": 15.5, + "completions/mean_terminated_length": 15.5, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.3045414069456812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09957019239664078, + "kl": 3.1638737618923187, + "learning_rate": 3.866897575457695e-06, + "loss": 0.11611393094062805, + "num_tokens": 6105413.0, + "reward": 0.17500000447034836, + "reward_std": 0.29398737102746964, + "rewards/reward_financial_reasoning/mean": 0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.2939873933792114, + "step": 684, + "step_time": 14.758325781003805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 33.75, + "completions/mean_terminated_length": 33.75, + "completions/min_length": 17.5, + "completions/min_terminated_length": 17.5, + "epoch": 0.3054318788958148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048933424055576324, + "kl": 2.0513499826192856, + "learning_rate": 3.861949529935676e-06, + "loss": 0.0797392949461937, + "num_tokens": 6122497.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 686, + "step_time": 23.42249280249962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.0, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 19.6875, + "completions/mean_terminated_length": 19.6875, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.30632235084594833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07173699885606766, + "kl": 2.6596494019031525, + "learning_rate": 3.857001484413657e-06, + "loss": 0.1044078841805458, + "num_tokens": 6135284.0, + "reward": 0.2750000059604645, + "reward_std": 0.5612486004829407, + "rewards/reward_financial_reasoning/mean": 0.2750000059604645, + "rewards/reward_financial_reasoning/std": 0.5612486004829407, + "step": 688, + "step_time": 13.520752940003149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.0, + "completions/max_terminated_length": 31.0, + "completions/mean_length": 24.3125, + "completions/mean_terminated_length": 24.3125, + "completions/min_length": 16.5, + "completions/min_terminated_length": 16.5, + "epoch": 0.3072128227960819, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.16115054488182068, + "kl": 3.1704325079917908, + "learning_rate": 3.852053438891638e-06, + "loss": 0.0991721972823143, + "num_tokens": 6152249.0, + "reward": -0.01249999925494194, + "reward_std": 0.12464234232902527, + "rewards/reward_financial_reasoning/mean": -0.01249999925494194, + "rewards/reward_financial_reasoning/std": 0.12464234232902527, + "step": 690, + "step_time": 17.69266291499298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 55.9375, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 55.9375, + "completions/mean_terminated_length": 9.869048118591309, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.3081032947462155, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.471900463104248, + "kl": 2.3048749417066574, + "learning_rate": 3.847105393369619e-06, + "loss": -0.026355061680078506, + "num_tokens": 6168208.0, + "reward": 0.11250000447034836, + "reward_std": 0.511647641658783, + "rewards/reward_financial_reasoning/mean": 0.11250000447034836, + "rewards/reward_financial_reasoning/std": 0.5116476565599442, + "step": 692, + "step_time": 78.48041973699947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 48.6875, + "completions/clipped_ratio": 0.125, + "completions/max_length": 174.5, + "completions/max_terminated_length": 50.5, + "completions/mean_length": 48.6875, + "completions/mean_terminated_length": 17.4375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.3089937666963491, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3391161859035492, + "kl": 2.7371310964226723, + "learning_rate": 3.8421573478476e-06, + "loss": 0.17964963614940643, + "num_tokens": 6181139.0, + "reward": 0.2500000074505806, + "reward_std": 0.46365733444690704, + "rewards/reward_financial_reasoning/mean": 0.2500000074505806, + "rewards/reward_financial_reasoning/std": 0.46365734934806824, + "step": 694, + "step_time": 54.42009541150037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 133.0, + "completions/max_terminated_length": 8.5, + "completions/mean_length": 23.375, + "completions/mean_terminated_length": 7.794642925262451, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.3098842386464826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5152590274810791, + "kl": 2.381638616323471, + "learning_rate": 3.837209302325582e-06, + "loss": 0.09065329283475876, + "num_tokens": 6196249.0, + "reward": -0.02499999850988388, + "reward_std": 0.5077963322401047, + "rewards/reward_financial_reasoning/mean": -0.02499999850988388, + "rewards/reward_financial_reasoning/std": 0.5077963322401047, + "step": 696, + "step_time": 46.216524112001935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.5, + "completions/max_terminated_length": 58.5, + "completions/mean_length": 25.0, + "completions/mean_terminated_length": 25.0, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.3107747105966162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3281242847442627, + "kl": 2.509884476661682, + "learning_rate": 3.832261256803563e-06, + "loss": 0.09444789588451385, + "num_tokens": 6215585.0, + "reward": -0.30000000447034836, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -0.30000000447034836, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 698, + "step_time": 27.547750868001458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 43.0, + "completions/max_terminated_length": 43.0, + "completions/mean_length": 25.1875, + "completions/mean_terminated_length": 25.1875, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.3116651825467498, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1636444628238678, + "kl": 2.3138733953237534, + "learning_rate": 3.827313211281544e-06, + "loss": 0.07426071166992188, + "num_tokens": 6238084.0, + "reward": 0.17500000447034836, + "reward_std": 0.29398737102746964, + "rewards/reward_financial_reasoning/mean": 0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.2939873933792114, + "step": 700, + "step_time": 25.016320683502272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 50.5, + "completions/max_terminated_length": 50.5, + "completions/mean_length": 22.3125, + "completions/mean_terminated_length": 22.3125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.31255565449688333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051183633506298065, + "kl": 2.2624500691890717, + "learning_rate": 3.822365165759525e-06, + "loss": 0.08230020105838776, + "num_tokens": 6255713.0, + "reward": 0.17500000447034836, + "reward_std": 0.29398737102746964, + "rewards/reward_financial_reasoning/mean": 0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.2939873933792114, + "step": 702, + "step_time": 23.571445789493737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.0, + "completions/max_terminated_length": 46.0, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.3134461264470169, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.08755451440811157, + "kl": 2.5206730663776398, + "learning_rate": 3.817417120237507e-06, + "loss": 0.19678334891796112, + "num_tokens": 6277597.0, + "reward": 0.32500001043081284, + "reward_std": 0.38347896933555603, + "rewards/reward_financial_reasoning/mean": 0.32500001043081284, + "rewards/reward_financial_reasoning/std": 0.3834789991378784, + "step": 704, + "step_time": 25.50389101500332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 75.125, + "completions/clipped_ratio": 0.25, + "completions/max_length": 143.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 75.125, + "completions/mean_terminated_length": 14.625, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.3143365983971505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20571209490299225, + "kl": 2.407316707074642, + "learning_rate": 3.8124690747154876e-06, + "loss": 0.09111398458480835, + "num_tokens": 6293983.0, + "reward": -0.15000000037252903, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -0.15000000037252903, + "rewards/reward_financial_reasoning/std": 0.10690449923276901, + "step": 706, + "step_time": 47.3297834570003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 21.0, + "completions/mean_terminated_length": 21.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.31522707034728403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12484844774007797, + "kl": 3.3755542635917664, + "learning_rate": 3.807521029193469e-06, + "loss": 0.13404133915901184, + "num_tokens": 6309839.0, + "reward": -0.10000000521540642, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": -0.10000000521540642, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 708, + "step_time": 16.799556064499484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 14.8125, + "completions/mean_terminated_length": 14.8125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.3161175422974176, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.978635787963867, + "kl": 3.1937762200832367, + "learning_rate": 3.8025729836714504e-06, + "loss": 0.08270812034606934, + "num_tokens": 6327140.0, + "reward": 0.23750001192092896, + "reward_std": 0.5119454711675644, + "rewards/reward_financial_reasoning/mean": 0.23750001192092896, + "rewards/reward_financial_reasoning/std": 0.5119454860687256, + "step": 710, + "step_time": 16.252217650497187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 74.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 74.0, + "completions/mean_terminated_length": 12.285715103149414, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.3170080142475512, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.725724697113037, + "kl": 3.6923981085419655, + "learning_rate": 3.7976249381494313e-06, + "loss": 0.07456135749816895, + "num_tokens": 6344436.0, + "reward": -0.17499999701976776, + "reward_std": 0.3752485066652298, + "rewards/reward_financial_reasoning/mean": -0.17499999701976776, + "rewards/reward_financial_reasoning/std": 0.375248521566391, + "step": 712, + "step_time": 80.90963733000171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 88.0625, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 15.5, + "completions/mean_length": 88.0625, + "completions/mean_terminated_length": 12.43333387374878, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.3178984861976848, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1844349503517151, + "kl": 2.4319792985916138, + "learning_rate": 3.7926768926274127e-06, + "loss": 0.11355370283126831, + "num_tokens": 6365565.0, + "reward": -0.07499999739229679, + "reward_std": 0.36553528159856796, + "rewards/reward_financial_reasoning/mean": -0.07499999739229679, + "rewards/reward_financial_reasoning/std": 0.36553528159856796, + "step": 714, + "step_time": 83.99197188099424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 44.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 100.5, + "completions/max_terminated_length": 100.5, + "completions/mean_length": 44.5625, + "completions/mean_terminated_length": 44.5625, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.3187889581478183, + "frac_reward_zero_std": 1.0, + "grad_norm": 16.39630699157715, + "kl": 2.6740624085068703, + "learning_rate": 3.7877288471053937e-06, + "loss": 0.10633575171232224, + "num_tokens": 6382526.0, + "reward": 0.17500000447034836, + "reward_std": 0.29398737102746964, + "rewards/reward_financial_reasoning/mean": 0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.2939873933792114, + "step": 716, + "step_time": 46.1464865390044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.25, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 140.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 33.25, + "completions/mean_terminated_length": 18.142857551574707, + "completions/min_length": 16.5, + "completions/min_terminated_length": 16.5, + "epoch": 0.3196794300979519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09907568991184235, + "kl": 3.2122897580266, + "learning_rate": 3.782780801583375e-06, + "loss": 0.12161869555711746, + "num_tokens": 6399386.0, + "reward": 0.07500000298023224, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.07500000298023224, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 718, + "step_time": 46.925280134506465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 54.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 54.375, + "completions/mean_terminated_length": 54.375, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.3205699020480855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2224346101284027, + "kl": 1.7517793476581573, + "learning_rate": 3.777832756061356e-06, + "loss": 0.052426815032958984, + "num_tokens": 6421864.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 720, + "step_time": 73.51747539999997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.0625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 139.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 32.0625, + "completions/mean_terminated_length": 16.830357551574707, + "completions/min_length": 14.5, + "completions/min_terminated_length": 14.5, + "epoch": 0.32146037399821903, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.532606601715088, + "kl": 2.519416108727455, + "learning_rate": 3.7728847105393374e-06, + "loss": 0.09540431946516037, + "num_tokens": 6433569.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 722, + "step_time": 44.58530493949365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 141.5, + "completions/max_terminated_length": 17.5, + "completions/mean_length": 27.875, + "completions/mean_terminated_length": 12.196428775787354, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.3223508459483526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13162082433700562, + "kl": 5.542090013623238, + "learning_rate": 3.7679366650173183e-06, + "loss": 0.21324403584003448, + "num_tokens": 6446543.0, + "reward": 0.1750000026077032, + "reward_std": 0.40089186280965805, + "rewards/reward_financial_reasoning/mean": 0.1750000026077032, + "rewards/reward_financial_reasoning/std": 0.40089186280965805, + "step": 724, + "step_time": 46.05484523400082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 145.5, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 32.125, + "completions/mean_terminated_length": 16.66964292526245, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.3232413178984862, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7261240482330322, + "kl": 4.556912407279015, + "learning_rate": 3.7629886194952997e-06, + "loss": 0.23144212365150452, + "num_tokens": 6454881.0, + "reward": 0.5374999940395355, + "reward_std": 0.47036218643188477, + "rewards/reward_financial_reasoning/mean": 0.5374999940395355, + "rewards/reward_financial_reasoning/std": 0.47036220133304596, + "step": 726, + "step_time": 43.249197524495685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.0, + "completions/max_terminated_length": 47.0, + "completions/mean_length": 20.625, + "completions/mean_terminated_length": 20.625, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.3241317898486198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1306663453578949, + "kl": 2.114730104804039, + "learning_rate": 3.7580405739732806e-06, + "loss": 0.08357012271881104, + "num_tokens": 6470147.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 728, + "step_time": 21.64191951349494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 40.0, + "completions/max_terminated_length": 40.0, + "completions/mean_length": 26.8125, + "completions/mean_terminated_length": 26.8125, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.3250222617987533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6427158117294312, + "kl": 2.4815937876701355, + "learning_rate": 3.753092528451262e-06, + "loss": 0.0976918414235115, + "num_tokens": 6492520.0, + "reward": -3.725290298461914e-09, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 730, + "step_time": 24.04095314299775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 61.25, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 135.0, + "completions/max_terminated_length": 19.5, + "completions/mean_length": 61.25, + "completions/mean_terminated_length": 17.487500190734863, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.3259127337488869, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.49971044063568115, + "kl": 2.464243160560727, + "learning_rate": 3.7481444829292434e-06, + "loss": 0.20906202495098114, + "num_tokens": 6518100.0, + "reward": -0.16250000149011612, + "reward_std": 0.34886594116687775, + "rewards/reward_financial_reasoning/mean": -0.16250000149011612, + "rewards/reward_financial_reasoning/std": 0.34886594116687775, + "step": 732, + "step_time": 53.07256942349704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 44.375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 178.5, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 44.375, + "completions/mean_terminated_length": 29.98214340209961, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.3268032056990205, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2685043811798096, + "kl": 2.9134768545627594, + "learning_rate": 3.7431964374072243e-06, + "loss": 0.045311249792575836, + "num_tokens": 6540474.0, + "reward": 0.012500000186264515, + "reward_std": 0.35391390323638916, + "rewards/reward_financial_reasoning/mean": 0.012500000186264515, + "rewards/reward_financial_reasoning/std": 0.35391390323638916, + "step": 734, + "step_time": 62.79009689700615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 76.0625, + "completions/clipped_ratio": 0.25, + "completions/max_length": 147.0, + "completions/max_terminated_length": 28.0, + "completions/mean_length": 76.0625, + "completions/mean_terminated_length": 16.5625, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.327693677649154, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8403189778327942, + "kl": 2.6961580216884613, + "learning_rate": 3.7382483918852057e-06, + "loss": 0.08386936783790588, + "num_tokens": 6554147.0, + "reward": 0.2875000089406967, + "reward_std": 0.49708831310272217, + "rewards/reward_financial_reasoning/mean": 0.2875000089406967, + "rewards/reward_financial_reasoning/std": 0.49708834290504456, + "step": 736, + "step_time": 47.54676332350209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 56.25, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 56.25, + "completions/mean_terminated_length": 9.880952596664429, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.3285841495992876, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.19178739190101624, + "kl": 7.0429524183273315, + "learning_rate": 3.7333003463631866e-06, + "loss": 0.14826089143753052, + "num_tokens": 6569375.0, + "reward": 0.48749998956918716, + "reward_std": 0.2176603004336357, + "rewards/reward_financial_reasoning/mean": 0.48749998956918716, + "rewards/reward_financial_reasoning/std": 0.2176603153347969, + "step": 738, + "step_time": 77.7282326079985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.25, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 136.5, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 26.25, + "completions/mean_terminated_length": 10.723214387893677, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.3294746215494212, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.231341361999512, + "kl": 2.5500387847423553, + "learning_rate": 3.728352300841168e-06, + "loss": 0.09851864725351334, + "num_tokens": 6586515.0, + "reward": 0.05000000260770321, + "reward_std": 0.42761795967817307, + "rewards/reward_financial_reasoning/mean": 0.05000000260770321, + "rewards/reward_financial_reasoning/std": 0.42761795967817307, + "step": 740, + "step_time": 47.18414942449817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.5, + "completions/max_terminated_length": 34.5, + "completions/mean_length": 16.5625, + "completions/mean_terminated_length": 16.5625, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.3303650934995548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2784968912601471, + "kl": 2.6422544419765472, + "learning_rate": 3.723404255319149e-06, + "loss": 0.09954223781824112, + "num_tokens": 6605620.0, + "reward": -0.30000000447034836, + "reward_std": 0.21380899101495743, + "rewards/reward_financial_reasoning/mean": -0.30000000447034836, + "rewards/reward_financial_reasoning/std": 0.21380899101495743, + "step": 742, + "step_time": 19.979887659497763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 12.4375, + "completions/mean_terminated_length": 12.4375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.3312555654496883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17147980630397797, + "kl": 3.159319758415222, + "learning_rate": 3.7184562097971303e-06, + "loss": 0.1264461874961853, + "num_tokens": 6625067.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 744, + "step_time": 14.4231962595004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 43.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.5, + "completions/max_terminated_length": 132.5, + "completions/mean_length": 43.9375, + "completions/mean_terminated_length": 43.9375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.3321460373998219, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.28793638944625854, + "kl": 6.326580494642258, + "learning_rate": 3.7135081642751113e-06, + "loss": 0.20715469121932983, + "num_tokens": 6640762.0, + "reward": 0.4375000149011612, + "reward_std": 0.44363605976104736, + "rewards/reward_financial_reasoning/mean": 0.4375000149011612, + "rewards/reward_financial_reasoning/std": 0.44363610446453094, + "step": 746, + "step_time": 43.157302853003785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 138.5, + "completions/max_terminated_length": 40.5, + "completions/mean_length": 40.625, + "completions/mean_terminated_length": 26.8125, + "completions/min_length": 18.5, + "completions/min_terminated_length": 18.5, + "epoch": 0.3330365093499555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.47648006677627563, + "kl": 2.5065064430236816, + "learning_rate": 3.7085601187530927e-06, + "loss": 0.09898876398801804, + "num_tokens": 6663932.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 748, + "step_time": 51.96744587050125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.5, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 143.0, + "completions/max_terminated_length": 33.0, + "completions/mean_length": 41.5, + "completions/mean_terminated_length": 27.48214340209961, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.333926981300089, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.8360135555267334, + "kl": 1.4322513043880463, + "learning_rate": 3.7036120732310745e-06, + "loss": 0.07792390882968903, + "num_tokens": 6685132.0, + "reward": 0.26250001043081284, + "reward_std": 0.3512909263372421, + "rewards/reward_financial_reasoning/mean": 0.26250001043081284, + "rewards/reward_financial_reasoning/std": 0.3512909561395645, + "step": 750, + "step_time": 51.64344793999771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.5, + "completions/max_terminated_length": 32.5, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.3348174532502226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11603382974863052, + "kl": 2.018613636493683, + "learning_rate": 3.698664027709055e-06, + "loss": 0.09922308474779129, + "num_tokens": 6706376.0, + "reward": 0.02499999850988388, + "reward_std": 0.45434410870075226, + "rewards/reward_financial_reasoning/mean": 0.02499999850988388, + "rewards/reward_financial_reasoning/std": 0.45434410870075226, + "step": 752, + "step_time": 20.996847507503844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.75, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 140.0, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 35.75, + "completions/mean_terminated_length": 20.9375, + "completions/min_length": 19.5, + "completions/min_terminated_length": 19.5, + "epoch": 0.3357079252003562, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3406031131744385, + "kl": 3.415686219930649, + "learning_rate": 3.6937159821870368e-06, + "loss": 0.23348277807235718, + "num_tokens": 6727876.0, + "reward": -0.13750001043081284, + "reward_std": 0.2256779968738556, + "rewards/reward_financial_reasoning/mean": -0.13750001043081284, + "rewards/reward_financial_reasoning/std": 0.2256779968738556, + "step": 754, + "step_time": 51.19353974749902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.5, + "completions/max_terminated_length": 16.5, + "completions/mean_length": 10.625, + "completions/mean_terminated_length": 10.625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.3365983971504898, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23292890191078186, + "kl": 3.3537650406360626, + "learning_rate": 3.6887679366650177e-06, + "loss": 0.12877531349658966, + "num_tokens": 6742294.0, + "reward": 0.30000001192092896, + "reward_std": 0.5345224589109421, + "rewards/reward_financial_reasoning/mean": 0.30000001192092896, + "rewards/reward_financial_reasoning/std": 0.5345224738121033, + "step": 756, + "step_time": 12.437249256501673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.4375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 209.5, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 42.4375, + "completions/mean_terminated_length": 27.973215103149414, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.3374888691006233, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.973015785217285, + "kl": 2.390896290540695, + "learning_rate": 3.683819891142999e-06, + "loss": 0.17431329190731049, + "num_tokens": 6759933.0, + "reward": 0.38750001788139343, + "reward_std": 0.39018382132053375, + "rewards/reward_financial_reasoning/mean": 0.38750001788139343, + "rewards/reward_financial_reasoning/std": 0.39018386602401733, + "step": 758, + "step_time": 67.14941029350302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 50.1875, + "completions/clipped_ratio": 0.125, + "completions/max_length": 146.0, + "completions/max_terminated_length": 30.5, + "completions/mean_length": 50.1875, + "completions/mean_terminated_length": 20.83333396911621, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.3383793410507569, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.08029485493898392, + "kl": 2.803036607801914, + "learning_rate": 3.67887184562098e-06, + "loss": 0.13053226470947266, + "num_tokens": 6777072.0, + "reward": 0.0, + "reward_std": 0.35675284266471863, + "rewards/reward_financial_reasoning/mean": 0.0, + "rewards/reward_financial_reasoning/std": 0.3567528575658798, + "step": 760, + "step_time": 49.21179148500232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 64.9375, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 133.5, + "completions/max_terminated_length": 95.5, + "completions/mean_length": 64.9375, + "completions/mean_terminated_length": 24.287500381469727, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.3392698130008905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07963932305574417, + "kl": 1.750863203778863, + "learning_rate": 3.6739238000989614e-06, + "loss": 0.07047466188669205, + "num_tokens": 6792711.0, + "reward": 0.19999998807907104, + "reward_std": 0.05345224589109421, + "rewards/reward_financial_reasoning/mean": 0.19999998807907104, + "rewards/reward_financial_reasoning/std": 0.05345224589109421, + "step": 762, + "step_time": 45.05649412400089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 60.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 32.3125, + "completions/mean_terminated_length": 32.3125, + "completions/min_length": 15.5, + "completions/min_terminated_length": 15.5, + "epoch": 0.340160284951024, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.80032479763031, + "kl": 2.7984056919813156, + "learning_rate": 3.6689757545769424e-06, + "loss": 0.09850553423166275, + "num_tokens": 6815724.0, + "reward": -0.15000000223517418, + "reward_std": 0.05345224589109421, + "rewards/reward_financial_reasoning/mean": -0.15000000223517418, + "rewards/reward_financial_reasoning/std": 0.05345224589109421, + "step": 764, + "step_time": 29.943595519500377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.5, + "completions/max_terminated_length": 30.5, + "completions/mean_length": 20.3125, + "completions/mean_terminated_length": 20.3125, + "completions/min_length": 15.5, + "completions/min_terminated_length": 15.5, + "epoch": 0.3410507569011576, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.911179780960083, + "kl": 2.389465108513832, + "learning_rate": 3.6640277090549238e-06, + "loss": 0.0941983088850975, + "num_tokens": 6831737.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 766, + "step_time": 17.419960692499444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 15.8125, + "completions/mean_terminated_length": 15.8125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.3419412288512912, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.5256670713424683, + "kl": 3.1901561617851257, + "learning_rate": 3.6590796635329047e-06, + "loss": 0.12186402082443237, + "num_tokens": 6847790.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 768, + "step_time": 15.88296320849986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.5, + "completions/max_terminated_length": 37.5, + "completions/mean_length": 21.8125, + "completions/mean_terminated_length": 21.8125, + "completions/min_length": 15.5, + "completions/min_terminated_length": 15.5, + "epoch": 0.3428317008014248, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.21360969543457, + "kl": 2.3833318948745728, + "learning_rate": 3.654131618010886e-06, + "loss": 0.1340945065021515, + "num_tokens": 6867195.0, + "reward": 0.06250000093132257, + "reward_std": 0.1060660183429718, + "rewards/reward_financial_reasoning/mean": 0.06250000093132257, + "rewards/reward_financial_reasoning/std": 0.1060660183429718, + "step": 770, + "step_time": 21.20947382099621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 137.5, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 31.5625, + "completions/mean_terminated_length": 17.017857551574707, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.3437221727515583, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.20955723524093628, + "kl": 2.9231202751398087, + "learning_rate": 3.6491835724888675e-06, + "loss": 0.17583100497722626, + "num_tokens": 6884020.0, + "reward": -0.0875000013038516, + "reward_std": 0.266422763466835, + "rewards/reward_financial_reasoning/mean": -0.0875000013038516, + "rewards/reward_financial_reasoning/std": 0.266422763466835, + "step": 772, + "step_time": 47.02959246299724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.0, + "completions/max_terminated_length": 49.0, + "completions/mean_length": 20.0, + "completions/mean_terminated_length": 20.0, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.3446126447016919, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1046200692653656, + "kl": 2.869453191757202, + "learning_rate": 3.6442355269668484e-06, + "loss": 0.1020328626036644, + "num_tokens": 6904068.0, + "reward": 0.17500000447034836, + "reward_std": 0.40089185535907745, + "rewards/reward_financial_reasoning/mean": 0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.40089187026023865, + "step": 774, + "step_time": 24.619027701508458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.5, + "completions/max_terminated_length": 13.5, + "completions/mean_length": 12.4375, + "completions/mean_terminated_length": 12.4375, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.3455031166518255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09514451026916504, + "kl": 22.426249851079774, + "learning_rate": 3.6392874814448298e-06, + "loss": 0.939132571220398, + "num_tokens": 6919491.0, + "reward": 0.6749999970197678, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": 0.6749999970197678, + "rewards/reward_financial_reasoning/std": 0.24053513258695602, + "step": 776, + "step_time": 13.205288213495805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.4375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 137.0, + "completions/max_terminated_length": 28.5, + "completions/mean_length": 25.4375, + "completions/mean_terminated_length": 10.312500476837158, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.346393588601959, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6481949090957642, + "kl": 3.8522638976573944, + "learning_rate": 3.6343394359228107e-06, + "loss": 0.21328844130039215, + "num_tokens": 6936298.0, + "reward": 0.23750001192092896, + "reward_std": 0.5505405366420746, + "rewards/reward_financial_reasoning/mean": 0.23750001192092896, + "rewards/reward_financial_reasoning/std": 0.550540566444397, + "step": 778, + "step_time": 47.54922737349989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 9.9375, + "completions/mean_terminated_length": 9.9375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.3472840605520926, + "frac_reward_zero_std": 0.75, + "grad_norm": 21.361452102661133, + "kl": 6.1261356472969055, + "learning_rate": 3.629391390400792e-06, + "loss": 0.28381842374801636, + "num_tokens": 6956857.0, + "reward": 0.6124999970197678, + "reward_std": 0.39018382132053375, + "rewards/reward_financial_reasoning/mean": 0.6124999970197678, + "rewards/reward_financial_reasoning/std": 0.39018385112285614, + "step": 780, + "step_time": 16.095249286994658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.25, + "completions/clipped_ratio": 0.125, + "completions/max_length": 256.0, + "completions/max_terminated_length": 14.5, + "completions/mean_length": 42.25, + "completions/mean_terminated_length": 11.714285850524902, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.3481745325022262, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.043459415435791, + "kl": 2.4115554690361023, + "learning_rate": 3.624443344878773e-06, + "loss": 0.18826088309288025, + "num_tokens": 6980629.0, + "reward": 0.2874999865889549, + "reward_std": 0.36345769464969635, + "rewards/reward_financial_reasoning/mean": 0.2874999865889549, + "rewards/reward_financial_reasoning/std": 0.36345770955085754, + "step": 782, + "step_time": 86.04697552700236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 134.0, + "completions/max_terminated_length": 27.5, + "completions/mean_length": 28.875, + "completions/mean_terminated_length": 14.008929252624512, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.3490650044523598, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.16503266990184784, + "kl": 1.425231909379363, + "learning_rate": 3.6194952993567544e-06, + "loss": -0.029601268470287323, + "num_tokens": 6996299.0, + "reward": 0.03749999403953552, + "reward_std": 0.5514859259128571, + "rewards/reward_financial_reasoning/mean": 0.03749999403953552, + "rewards/reward_financial_reasoning/std": 0.5514859408140182, + "step": 784, + "step_time": 46.38959728450209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 108.875, + "completions/clipped_ratio": 0.375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 32.5, + "completions/mean_length": 108.875, + "completions/mean_terminated_length": 18.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.3499554764024933, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.0817203521728516, + "kl": 2.4215634018182755, + "learning_rate": 3.6145472538347354e-06, + "loss": 0.07867254316806793, + "num_tokens": 7012225.0, + "reward": 0.03749999403953552, + "reward_std": 0.35606882721185684, + "rewards/reward_financial_reasoning/mean": 0.03749999403953552, + "rewards/reward_financial_reasoning/std": 0.35606882721185684, + "step": 786, + "step_time": 77.41537013850393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 53.5, + "completions/max_terminated_length": 53.5, + "completions/mean_length": 19.9375, + "completions/mean_terminated_length": 19.9375, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.3508459483526269, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1960225105285645, + "kl": 3.1713290363550186, + "learning_rate": 3.6095992083127167e-06, + "loss": 0.11753413081169128, + "num_tokens": 7032832.0, + "reward": 0.3499999865889549, + "reward_std": 0.21380899101495743, + "rewards/reward_financial_reasoning/mean": 0.3499999865889549, + "rewards/reward_financial_reasoning/std": 0.21380899101495743, + "step": 788, + "step_time": 26.475698017497052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 30.5, + "completions/max_terminated_length": 30.5, + "completions/mean_length": 18.1875, + "completions/mean_terminated_length": 18.1875, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.3517364203027605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3880739212036133, + "kl": 1.8633519411087036, + "learning_rate": 3.6046511627906977e-06, + "loss": 0.07043511420488358, + "num_tokens": 7052539.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 790, + "step_time": 19.250361710492143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 51.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 144.5, + "completions/max_terminated_length": 36.5, + "completions/mean_length": 51.0, + "completions/mean_terminated_length": 21.08333396911621, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.352626892252894, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.956282138824463, + "kl": 3.7996502816677094, + "learning_rate": 3.599703117268679e-06, + "loss": 0.10686469823122025, + "num_tokens": 7075259.0, + "reward": 0.20000000670552254, + "reward_std": 0.3760698735713959, + "rewards/reward_financial_reasoning/mean": 0.20000000670552254, + "rewards/reward_financial_reasoning/std": 0.37606990337371826, + "step": 792, + "step_time": 52.96954656899834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 140.0, + "completions/max_terminated_length": 34.0, + "completions/mean_length": 30.125, + "completions/mean_terminated_length": 15.428571701049805, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.3535173642030276, + "frac_reward_zero_std": 0.75, + "grad_norm": 1997.9693603515625, + "kl": 342.5339174568653, + "learning_rate": 3.5947550717466604e-06, + "loss": 9.447096824645996, + "num_tokens": 7092589.0, + "reward": 0.36250001192092896, + "reward_std": 0.5238144397735596, + "rewards/reward_financial_reasoning/mean": 0.36250001192092896, + "rewards/reward_financial_reasoning/std": 0.523814469575882, + "step": 794, + "step_time": 47.50602113500645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 55.5, + "completions/max_terminated_length": 55.5, + "completions/mean_length": 17.5625, + "completions/mean_terminated_length": 17.5625, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.3544078361531612, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4262450933456421, + "kl": 2.2002196609973907, + "learning_rate": 3.5898070262246414e-06, + "loss": 0.09302681684494019, + "num_tokens": 7111166.0, + "reward": -0.2749999985098839, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": -0.2749999985098839, + "rewards/reward_financial_reasoning/std": 0.24053511768579483, + "step": 796, + "step_time": 25.643957539497933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.35529830810329477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12062408030033112, + "kl": 2.885861501097679, + "learning_rate": 3.5848589807026228e-06, + "loss": 0.11578873544931412, + "num_tokens": 7133662.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 798, + "step_time": 18.64683330200205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.9375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 182.0, + "completions/max_terminated_length": 63.5, + "completions/mean_length": 34.9375, + "completions/mean_terminated_length": 19.535714626312256, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.3561887800534283, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.21271134912967682, + "kl": 2.7550200819969177, + "learning_rate": 3.5799109351806037e-06, + "loss": 0.15305285155773163, + "num_tokens": 7154069.0, + "reward": -0.03750000149011612, + "reward_std": 0.4317670986056328, + "rewards/reward_financial_reasoning/mean": -0.03750000149011612, + "rewards/reward_financial_reasoning/std": 0.4317671060562134, + "step": 800, + "step_time": 60.9049749980004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 49.4375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 158.0, + "completions/max_terminated_length": 52.0, + "completions/mean_length": 49.4375, + "completions/mean_terminated_length": 35.66964340209961, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.3570792520035619, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05105382576584816, + "kl": 1.9183711856603622, + "learning_rate": 3.574962889658585e-06, + "loss": 0.05830772966146469, + "num_tokens": 7169716.0, + "reward": -0.04999999701976776, + "reward_std": 0.37416573613882065, + "rewards/reward_financial_reasoning/mean": -0.04999999701976776, + "rewards/reward_financial_reasoning/std": 0.37416573613882065, + "step": 802, + "step_time": 53.53614674049095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 107.375, + "completions/clipped_ratio": 0.375, + "completions/max_length": 256.0, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 107.375, + "completions/mean_terminated_length": 19.166666984558105, + "completions/min_length": 18.5, + "completions/min_terminated_length": 18.5, + "epoch": 0.3579697239536955, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.246635913848877, + "kl": 25.036666467785835, + "learning_rate": 3.570014844136566e-06, + "loss": 1.1702524423599243, + "num_tokens": 7187594.0, + "reward": 0.07500000111758709, + "reward_std": 0.276574470102787, + "rewards/reward_financial_reasoning/mean": 0.07500000111758709, + "rewards/reward_financial_reasoning/std": 0.2765744850039482, + "step": 804, + "step_time": 80.11308830999769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 46.5625, + "completions/clipped_ratio": 0.125, + "completions/max_length": 140.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 46.5625, + "completions/mean_terminated_length": 16.64583396911621, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.358860195903829, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.19011610746383667, + "kl": 2.6784499436616898, + "learning_rate": 3.565066798614548e-06, + "loss": 0.20479558408260345, + "num_tokens": 7196979.0, + "reward": 0.17500000447034836, + "reward_std": 0.2314550280570984, + "rewards/reward_financial_reasoning/mean": 0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.2314550280570984, + "step": 806, + "step_time": 43.39767270999437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.6875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 136.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 29.6875, + "completions/mean_terminated_length": 14.875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.3597506678539626, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1912928819656372, + "kl": 3.8488663136959076, + "learning_rate": 3.5601187530925284e-06, + "loss": 0.1310974359512329, + "num_tokens": 7214110.0, + "reward": 0.6250000149011612, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.6250000149011612, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 808, + "step_time": 48.412980310498824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 18.6875, + "completions/mean_terminated_length": 18.6875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.3606411398040962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11078284680843353, + "kl": 2.806388795375824, + "learning_rate": 3.55517070757051e-06, + "loss": 0.10246068984270096, + "num_tokens": 7233401.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 810, + "step_time": 18.20412028949795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 17.625, + "completions/mean_terminated_length": 17.625, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.36153161175422976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17546121776103973, + "kl": 2.027177184820175, + "learning_rate": 3.5502226620484907e-06, + "loss": 0.08124741911888123, + "num_tokens": 7251419.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 812, + "step_time": 17.06027586250275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 49.6875, + "completions/clipped_ratio": 0.125, + "completions/max_length": 138.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 49.6875, + "completions/mean_terminated_length": 20.33333396911621, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3624220837043633, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.1511795073747635, + "kl": 2.3284588307142258, + "learning_rate": 3.5452746165264725e-06, + "loss": 0.19820040464401245, + "num_tokens": 7269582.0, + "reward": 0.02500000037252903, + "reward_std": 0.13887301087379456, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.13887301087379456, + "step": 814, + "step_time": 47.642690775999654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.5, + "completions/max_terminated_length": 26.5, + "completions/mean_length": 20.5625, + "completions/mean_terminated_length": 20.5625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.3633125556544969, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18224821984767914, + "kl": 3.339336931705475, + "learning_rate": 3.540326571004454e-06, + "loss": 0.13968926668167114, + "num_tokens": 7287047.0, + "reward": -0.20000001043081284, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -0.20000001043081284, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 816, + "step_time": 16.942964851496072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 83.5625, + "completions/clipped_ratio": 0.25, + "completions/max_length": 148.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 83.5625, + "completions/mean_terminated_length": 22.5625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.36420302760463047, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5113762617111206, + "kl": 1.7933121919631958, + "learning_rate": 3.535378525482435e-06, + "loss": 0.08266621828079224, + "num_tokens": 7302512.0, + "reward": 0.17500000074505806, + "reward_std": 0.4242233335971832, + "rewards/reward_financial_reasoning/mean": 0.17500000074505806, + "rewards/reward_financial_reasoning/std": 0.4242233335971832, + "step": 818, + "step_time": 47.5636702519987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.0, + "completions/max_terminated_length": 29.0, + "completions/mean_length": 16.875, + "completions/mean_terminated_length": 16.875, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.365093499554764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2267179638147354, + "kl": 4.005930542945862, + "learning_rate": 3.530430479960416e-06, + "loss": 0.1383998841047287, + "num_tokens": 7322222.0, + "reward": -0.12500000186264515, + "reward_std": 0.24053511768579483, + "rewards/reward_financial_reasoning/mean": -0.12500000186264515, + "rewards/reward_financial_reasoning/std": 0.24053511768579483, + "step": 820, + "step_time": 19.202351930998702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.0, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 23.75, + "completions/mean_terminated_length": 23.75, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.3659839715048976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08866865932941437, + "kl": 2.016817420721054, + "learning_rate": 3.525482434438397e-06, + "loss": 0.08069662004709244, + "num_tokens": 7339330.0, + "reward": 0.17500000447034836, + "reward_std": 0.29398737102746964, + "rewards/reward_financial_reasoning/mean": 0.17500000447034836, + "rewards/reward_financial_reasoning/std": 0.2939873933792114, + "step": 822, + "step_time": 18.552810501500062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 10.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13.0, + "completions/max_terminated_length": 13.0, + "completions/mean_length": 10.375, + "completions/mean_terminated_length": 10.375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.3668744434550312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1785629242658615, + "kl": 2.7913960814476013, + "learning_rate": 3.5205343889163785e-06, + "loss": 0.11267250776290894, + "num_tokens": 7356440.0, + "reward": -0.20000000298023224, + "reward_std": 0.32071349024772644, + "rewards/reward_financial_reasoning/mean": -0.20000000298023224, + "rewards/reward_financial_reasoning/std": 0.32071349024772644, + "step": 824, + "step_time": 13.125887238500582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 15.4375, + "completions/mean_terminated_length": 15.4375, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.36776491540516476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34747812151908875, + "kl": 2.4959478676319122, + "learning_rate": 3.5155863433943594e-06, + "loss": 0.09693935513496399, + "num_tokens": 7375239.0, + "reward": 0.1250000074505806, + "reward_std": 0.34743961691856384, + "rewards/reward_financial_reasoning/mean": 0.1250000074505806, + "rewards/reward_financial_reasoning/std": 0.34743963181972504, + "step": 826, + "step_time": 16.67322523299299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 11.375, + "completions/mean_terminated_length": 11.375, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.3686553873552983, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.09147421270608902, + "kl": 4.733070477843285, + "learning_rate": 3.510638297872341e-06, + "loss": 0.16881364583969116, + "num_tokens": 7393853.0, + "reward": 0.13750000670552254, + "reward_std": 0.36228442192077637, + "rewards/reward_financial_reasoning/mean": 0.13750000670552254, + "rewards/reward_financial_reasoning/std": 0.36228442192077637, + "step": 828, + "step_time": 15.064109992996237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.5, + "completions/max_terminated_length": 21.5, + "completions/mean_length": 15.5625, + "completions/mean_terminated_length": 15.5625, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.3695458593054319, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.8771681785583496, + "kl": 3.5439845621585846, + "learning_rate": 3.5056902523503218e-06, + "loss": 0.15652626752853394, + "num_tokens": 7415678.0, + "reward": 0.10000000149011612, + "reward_std": 0.1963960975408554, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.1963960975408554, + "step": 830, + "step_time": 18.503596431502956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 22.4375, + "completions/mean_terminated_length": 22.4375, + "completions/min_length": 17.5, + "completions/min_terminated_length": 17.5, + "epoch": 0.37043633125556547, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.17112646996974945, + "kl": 2.819959133863449, + "learning_rate": 3.500742206828303e-06, + "loss": 0.12506897747516632, + "num_tokens": 7429525.0, + "reward": 0.20000000670552254, + "reward_std": 0.2905927151441574, + "rewards/reward_financial_reasoning/mean": 0.20000000670552254, + "rewards/reward_financial_reasoning/std": 0.2905927300453186, + "step": 832, + "step_time": 14.886599326004216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.5, + "completions/max_terminated_length": 21.5, + "completions/mean_length": 15.8125, + "completions/mean_terminated_length": 15.8125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.371326803205699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14576126635074615, + "kl": 2.9340180456638336, + "learning_rate": 3.4957941613062845e-06, + "loss": 0.11859500408172607, + "num_tokens": 7448242.0, + "reward": -0.15000000223517418, + "reward_std": 0.05345224589109421, + "rewards/reward_financial_reasoning/mean": -0.15000000223517418, + "rewards/reward_financial_reasoning/std": 0.05345224589109421, + "step": 834, + "step_time": 16.496135844503442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 42.5, + "completions/max_terminated_length": 42.5, + "completions/mean_length": 30.5, + "completions/mean_terminated_length": 30.5, + "completions/min_length": 22.5, + "completions/min_terminated_length": 22.5, + "epoch": 0.3722172751558326, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11725213378667831, + "kl": 1.9590845555067062, + "learning_rate": 3.4908461157842655e-06, + "loss": 0.07787135988473892, + "num_tokens": 7465378.0, + "reward": -3.725290298461914e-09, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 836, + "step_time": 21.61521376399469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 32.5, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 138.0, + "completions/max_terminated_length": 26.0, + "completions/mean_length": 32.5, + "completions/mean_terminated_length": 17.589285850524902, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.37310774710596617, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.323338031768799, + "kl": 2.400451347231865, + "learning_rate": 3.485898070262247e-06, + "loss": 0.17991583049297333, + "num_tokens": 7479754.0, + "reward": 0.2875000089406967, + "reward_std": 0.4670701175928116, + "rewards/reward_financial_reasoning/mean": 0.2875000089406967, + "rewards/reward_financial_reasoning/std": 0.4670701324939728, + "step": 838, + "step_time": 46.93090354149899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.5, + "completions/max_terminated_length": 87.5, + "completions/mean_length": 30.5625, + "completions/mean_terminated_length": 30.5625, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.37399821905609976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14068838953971863, + "kl": 2.13110613822937, + "learning_rate": 3.480950024740228e-06, + "loss": 0.08287981897592545, + "num_tokens": 7502115.0, + "reward": 0.1250000037252903, + "reward_std": 0.34743958711624146, + "rewards/reward_financial_reasoning/mean": 0.1250000037252903, + "rewards/reward_financial_reasoning/std": 0.34743958711624146, + "step": 840, + "step_time": 38.0494445104996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 35.5, + "completions/max_terminated_length": 35.5, + "completions/mean_length": 27.1875, + "completions/mean_terminated_length": 27.1875, + "completions/min_length": 17.5, + "completions/min_terminated_length": 17.5, + "epoch": 0.3748886910062333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2852267026901245, + "kl": 2.796256124973297, + "learning_rate": 3.476001979218209e-06, + "loss": 0.11120793223381042, + "num_tokens": 7519318.0, + "reward": 0.07500000298023224, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.07500000298023224, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 842, + "step_time": 19.269342353996763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.375, + "completions/mean_terminated_length": 13.375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.3757791629563669, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4274088740348816, + "kl": 3.7996322214603424, + "learning_rate": 3.47105393369619e-06, + "loss": 0.12690269947052002, + "num_tokens": 7541116.0, + "reward": 0.1250000074505806, + "reward_std": 0.34743961691856384, + "rewards/reward_financial_reasoning/mean": 0.1250000074505806, + "rewards/reward_financial_reasoning/std": 0.34743963181972504, + "step": 844, + "step_time": 18.219404364499496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.1875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 17.5, + "completions/max_terminated_length": 17.5, + "completions/mean_length": 13.1875, + "completions/mean_terminated_length": 13.1875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.37666963490650046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23236680030822754, + "kl": 2.7688030302524567, + "learning_rate": 3.4661058881741715e-06, + "loss": 0.10391891002655029, + "num_tokens": 7559863.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 846, + "step_time": 15.469687197000894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.1875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 157.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 37.1875, + "completions/mean_terminated_length": 21.991071701049805, + "completions/min_length": 15.5, + "completions/min_terminated_length": 15.5, + "epoch": 0.377560106856634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13132241368293762, + "kl": 2.3280729204416275, + "learning_rate": 3.4611578426521524e-06, + "loss": 0.07760872691869736, + "num_tokens": 7581346.0, + "reward": 0.10000000894069672, + "reward_std": 0.21380899846553802, + "rewards/reward_financial_reasoning/mean": 0.10000000894069672, + "rewards/reward_financial_reasoning/std": 0.21380901336669922, + "step": 848, + "step_time": 57.75018220250058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 44.5, + "completions/clipped_ratio": 0.125, + "completions/max_length": 134.5, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 44.5, + "completions/mean_terminated_length": 14.625, + "completions/min_length": 14.5, + "completions/min_terminated_length": 14.5, + "epoch": 0.3784505788067676, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0872406959533691, + "kl": 1.559784710407257, + "learning_rate": 3.456209797130134e-06, + "loss": 0.11022936552762985, + "num_tokens": 7594410.0, + "reward": 0.2500000074505806, + "reward_std": 0.2577935457229614, + "rewards/reward_financial_reasoning/mean": 0.2500000074505806, + "rewards/reward_financial_reasoning/std": 0.2577935680747032, + "step": 850, + "step_time": 45.99375293400226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 134.5, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 37.0, + "completions/mean_terminated_length": 23.4375, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.37934105075690117, + "frac_reward_zero_std": 0.75, + "grad_norm": 12.833792686462402, + "kl": 3.972812756896019, + "learning_rate": 3.4512617516081148e-06, + "loss": 0.2475769817829132, + "num_tokens": 7615866.0, + "reward": -0.16249999683350325, + "reward_std": 0.1862443909049034, + "rewards/reward_financial_reasoning/mean": -0.16249999683350325, + "rewards/reward_financial_reasoning/std": 0.1862443909049034, + "step": 852, + "step_time": 51.58180875749531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 47.5, + "completions/max_terminated_length": 47.5, + "completions/mean_length": 23.3125, + "completions/mean_terminated_length": 23.3125, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.38023152270703475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23057197034358978, + "kl": 3.2737006843090057, + "learning_rate": 3.446313706086096e-06, + "loss": 0.10352115333080292, + "num_tokens": 7638007.0, + "reward": -0.125, + "reward_std": 0.29398736357688904, + "rewards/reward_financial_reasoning/mean": -0.125, + "rewards/reward_financial_reasoning/std": 0.29398736357688904, + "step": 854, + "step_time": 26.51089113649141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 46.5, + "completions/max_terminated_length": 46.5, + "completions/mean_length": 12.4375, + "completions/mean_terminated_length": 12.4375, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.3811219946571683, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.749869346618652, + "kl": 3.574433773756027, + "learning_rate": 3.441365660564078e-06, + "loss": 0.08661140501499176, + "num_tokens": 7652638.0, + "reward": -0.21249999850988388, + "reward_std": 0.3803405165672302, + "rewards/reward_financial_reasoning/mean": -0.21249999850988388, + "rewards/reward_financial_reasoning/std": 0.3803405165672302, + "step": 856, + "step_time": 21.310353426000802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 59.75, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 142.5, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 59.75, + "completions/mean_terminated_length": 12.912500143051147, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.38201246660730187, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.19118884205818176, + "kl": 3.842833936214447, + "learning_rate": 3.4364176150420585e-06, + "loss": 0.18493984639644623, + "num_tokens": 7669394.0, + "reward": 0.23749998770654202, + "reward_std": 0.38225453346967697, + "rewards/reward_financial_reasoning/mean": 0.23749998770654202, + "rewards/reward_financial_reasoning/std": 0.3822545036673546, + "step": 858, + "step_time": 48.92800594100481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 13.875, + "completions/mean_terminated_length": 13.875, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.38290293855743546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16126124560832977, + "kl": 4.304675847291946, + "learning_rate": 3.4314695695200403e-06, + "loss": 0.1495298147201538, + "num_tokens": 7687552.0, + "reward": 0.3499999865889549, + "reward_std": 0.21380899101495743, + "rewards/reward_financial_reasoning/mean": 0.3499999865889549, + "rewards/reward_financial_reasoning/std": 0.21380899101495743, + "step": 860, + "step_time": 17.01638667500447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 9.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.5, + "completions/max_terminated_length": 14.5, + "completions/mean_length": 9.6875, + "completions/mean_terminated_length": 9.6875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.383793410507569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17267750203609467, + "kl": 4.147587463259697, + "learning_rate": 3.4265215239980208e-06, + "loss": 0.15131743252277374, + "num_tokens": 7699363.0, + "reward": 0.5249999910593033, + "reward_std": 0.40089183300733566, + "rewards/reward_financial_reasoning/mean": 0.5249999910593033, + "rewards/reward_financial_reasoning/std": 0.40089183300733566, + "step": 862, + "step_time": 10.703602764002426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 58.0625, + "completions/clipped_ratio": 0.125, + "completions/max_length": 136.5, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 58.0625, + "completions/mean_terminated_length": 32.1875, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.3846838824577026, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8160765767097473, + "kl": 2.1349672228097916, + "learning_rate": 3.4215734784760026e-06, + "loss": 0.06090102344751358, + "num_tokens": 7719948.0, + "reward": 0.23750001192092896, + "reward_std": 0.5119454711675644, + "rewards/reward_financial_reasoning/mean": 0.23750001192092896, + "rewards/reward_financial_reasoning/std": 0.5119454860687256, + "step": 864, + "step_time": 51.53723118200287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.75, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 136.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 28.75, + "completions/mean_terminated_length": 13.500000476837158, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.38557435440783616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14556773006916046, + "kl": 2.87981840968132, + "learning_rate": 3.416625432953983e-06, + "loss": 0.10689441114664078, + "num_tokens": 7741352.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 866, + "step_time": 51.90839271850564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 61.0625, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 61.0625, + "completions/mean_terminated_length": 16.321428775787354, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.38646482635796975, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.3730831146240234, + "kl": 1.4520172700285912, + "learning_rate": 3.411677387431965e-06, + "loss": 0.2352294623851776, + "num_tokens": 7761969.0, + "reward": 0.3125000149011612, + "reward_std": 0.49342095851898193, + "rewards/reward_financial_reasoning/mean": 0.3125000149011612, + "rewards/reward_financial_reasoning/std": 0.4934210181236267, + "step": 868, + "step_time": 84.79485128700253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 29.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.5, + "completions/max_terminated_length": 68.5, + "completions/mean_length": 29.6875, + "completions/mean_terminated_length": 29.6875, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.3873552983081033, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8227584958076477, + "kl": 3.320344388484955, + "learning_rate": 3.406729341909946e-06, + "loss": 0.11288516968488693, + "num_tokens": 7775516.0, + "reward": 0.4000000059604645, + "reward_std": 0.5345224589109421, + "rewards/reward_financial_reasoning/mean": 0.4000000059604645, + "rewards/reward_financial_reasoning/std": 0.5345224738121033, + "step": 870, + "step_time": 26.270858613999735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 24.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.5, + "completions/max_terminated_length": 28.5, + "completions/mean_length": 24.9375, + "completions/mean_terminated_length": 24.9375, + "completions/min_length": 20.5, + "completions/min_terminated_length": 20.5, + "epoch": 0.38824577025823687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056442953646183014, + "kl": 1.64863820374012, + "learning_rate": 3.4017812963879272e-06, + "loss": 0.06417623907327652, + "num_tokens": 7790747.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 872, + "step_time": 16.352528546009125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 42.8125, + "completions/clipped_ratio": 0.125, + "completions/max_length": 138.5, + "completions/max_terminated_length": 17.5, + "completions/mean_length": 42.8125, + "completions/mean_terminated_length": 11.791666746139526, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.38913624220837045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22031089663505554, + "kl": 2.7123608253896236, + "learning_rate": 3.396833250865908e-06, + "loss": 0.10119879245758057, + "num_tokens": 7803032.0, + "reward": 0.37500000558793545, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.37500000558793545, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 874, + "step_time": 44.841470889492484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 25.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.5, + "completions/max_terminated_length": 31.5, + "completions/mean_length": 25.125, + "completions/mean_terminated_length": 25.125, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.390026714158504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15465430915355682, + "kl": 2.474703371524811, + "learning_rate": 3.3918852053438895e-06, + "loss": 0.09784232825040817, + "num_tokens": 7823546.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 876, + "step_time": 20.668088107500807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 48.5, + "completions/max_terminated_length": 48.5, + "completions/mean_length": 20.375, + "completions/mean_terminated_length": 20.375, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.39091718610863757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2885158956050873, + "kl": 2.143370568752289, + "learning_rate": 3.386937159821871e-06, + "loss": 0.08493253588676453, + "num_tokens": 7845432.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 878, + "step_time": 26.40964586049813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 36.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 20.4375, + "completions/mean_terminated_length": 20.4375, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.39180765805877116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3435024321079254, + "kl": 3.9613396525382996, + "learning_rate": 3.381989114299852e-06, + "loss": 0.14171364903450012, + "num_tokens": 7864639.0, + "reward": -0.07500000298023224, + "reward_std": 0.34743960946798325, + "rewards/reward_financial_reasoning/mean": -0.07500000298023224, + "rewards/reward_financial_reasoning/std": 0.34743960946798325, + "step": 880, + "step_time": 21.07708440350325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.5, + "completions/max_terminated_length": 59.5, + "completions/mean_length": 19.0, + "completions/mean_terminated_length": 19.0, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.39269813000890474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30529144406318665, + "kl": 2.770953595638275, + "learning_rate": 3.3770410687778332e-06, + "loss": 0.1095418706536293, + "num_tokens": 7881399.0, + "reward": 0.1250000037252903, + "reward_std": 0.34743958711624146, + "rewards/reward_financial_reasoning/mean": 0.1250000037252903, + "rewards/reward_financial_reasoning/std": 0.34743958711624146, + "step": 882, + "step_time": 26.48739288449724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 48.0625, + "completions/clipped_ratio": 0.125, + "completions/max_length": 138.5, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 48.0625, + "completions/mean_terminated_length": 17.9375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.3935886019590383, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.556560516357422, + "kl": 4.03894579410553, + "learning_rate": 3.372093023255814e-06, + "loss": 0.06743745505809784, + "num_tokens": 7902416.0, + "reward": -0.0624999962747097, + "reward_std": 0.219983771443367, + "rewards/reward_financial_reasoning/mean": -0.0624999962747097, + "rewards/reward_financial_reasoning/std": 0.219983771443367, + "step": 884, + "step_time": 51.95550639049543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.5, + "completions/max_terminated_length": 96.5, + "completions/mean_length": 34.5625, + "completions/mean_terminated_length": 34.5625, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.39447907390917186, + "frac_reward_zero_std": 1.0, + "grad_norm": 5.515413761138916, + "kl": 2.448286272585392, + "learning_rate": 3.3671449777337956e-06, + "loss": 0.09613915532827377, + "num_tokens": 7921649.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 886, + "step_time": 38.43645350350198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 50.8125, + "completions/clipped_ratio": 0.125, + "completions/max_length": 135.5, + "completions/max_terminated_length": 74.5, + "completions/mean_length": 50.8125, + "completions/mean_terminated_length": 22.9375, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.39536954585930545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16039247810840607, + "kl": 2.114430546760559, + "learning_rate": 3.3621969322117765e-06, + "loss": 0.06777290999889374, + "num_tokens": 7944982.0, + "reward": -0.1249999962747097, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": -0.1249999962747097, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 888, + "step_time": 52.79890574550154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 21.0625, + "completions/mean_terminated_length": 21.0625, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.396260017809439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21837559342384338, + "kl": 1.9849668145179749, + "learning_rate": 3.357248886689758e-06, + "loss": 0.07559309899806976, + "num_tokens": 7956239.0, + "reward": 0.424999987706542, + "reward_std": 0.13363061845302582, + "rewards/reward_financial_reasoning/mean": 0.424999987706542, + "rewards/reward_financial_reasoning/std": 0.13363061845302582, + "step": 890, + "step_time": 25.786644731997512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 18.8125, + "completions/mean_terminated_length": 18.8125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.39715048975957257, + "frac_reward_zero_std": 0.5, + "grad_norm": 11.93359088897705, + "kl": 2.411112889647484, + "learning_rate": 3.352300841167739e-06, + "loss": 0.003915680572390556, + "num_tokens": 7974396.0, + "reward": 0.13750000670552254, + "reward_std": 0.26692694425582886, + "rewards/reward_financial_reasoning/mean": 0.13750000670552254, + "rewards/reward_financial_reasoning/std": 0.26692697405815125, + "step": 892, + "step_time": 17.294510296509543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 11.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 11.3125, + "completions/mean_terminated_length": 11.3125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.39804096170970615, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.600701332092285, + "kl": 2.586108446121216, + "learning_rate": 3.34735279564572e-06, + "loss": 0.07793588191270828, + "num_tokens": 7987977.0, + "reward": 0.1375000085681677, + "reward_std": 0.28327932208776474, + "rewards/reward_financial_reasoning/mean": 0.1375000085681677, + "rewards/reward_financial_reasoning/std": 0.2832793518900871, + "step": 894, + "step_time": 11.401168355005211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.0, + "completions/max_terminated_length": 27.0, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 18.5, + "completions/min_terminated_length": 18.5, + "epoch": 0.39893143365983974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.44599446654319763, + "kl": 2.8776747286319733, + "learning_rate": 3.342404750123701e-06, + "loss": 0.11476292461156845, + "num_tokens": 8004293.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 896, + "step_time": 17.037187163503404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 141.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 31.375, + "completions/mean_terminated_length": 16.321428775787354, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.39982190560997327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2007230818271637, + "kl": 3.3882580399513245, + "learning_rate": 3.3374567046016825e-06, + "loss": 0.11634482443332672, + "num_tokens": 8022299.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 898, + "step_time": 50.946265338003286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 57.5, + "completions/max_terminated_length": 57.5, + "completions/mean_length": 21.875, + "completions/mean_terminated_length": 21.875, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.40071237756010686, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.8853933811187744, + "kl": 2.4549517929553986, + "learning_rate": 3.332508659079664e-06, + "loss": 0.1362210363149643, + "num_tokens": 8040041.0, + "reward": 0.07499999925494194, + "reward_std": 0.0707106813788414, + "rewards/reward_financial_reasoning/mean": 0.07499999925494194, + "rewards/reward_financial_reasoning/std": 0.0707106813788414, + "step": 900, + "step_time": 25.934856780499103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.0625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 143.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 27.0625, + "completions/mean_terminated_length": 11.500000238418579, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.40160284951024044, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.213332176208496, + "kl": 5.65405336022377, + "learning_rate": 3.327560613557645e-06, + "loss": 0.2474951297044754, + "num_tokens": 8067754.0, + "reward": -0.17500000074505806, + "reward_std": 0.39675553888082504, + "rewards/reward_financial_reasoning/mean": -0.17500000074505806, + "rewards/reward_financial_reasoning/std": 0.39675553888082504, + "step": 902, + "step_time": 58.80656282400014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.8125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 136.5, + "completions/max_terminated_length": 19.5, + "completions/mean_length": 30.8125, + "completions/mean_terminated_length": 15.892857551574707, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.402493321460374, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.881185054779053, + "kl": 2.6317369118332863, + "learning_rate": 3.3226125680356262e-06, + "loss": 0.18388396501541138, + "num_tokens": 8085895.0, + "reward": 0.1375000085681677, + "reward_std": 0.28327932208776474, + "rewards/reward_financial_reasoning/mean": 0.1375000085681677, + "rewards/reward_financial_reasoning/std": 0.2832793518900871, + "step": 904, + "step_time": 49.99714060999759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 21.4375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 28.5, + "completions/max_terminated_length": 28.5, + "completions/mean_length": 21.4375, + "completions/mean_terminated_length": 21.4375, + "completions/min_length": 17.5, + "completions/min_terminated_length": 17.5, + "epoch": 0.40338379341050756, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.09603514522314072, + "kl": 3.4366951882839203, + "learning_rate": 3.317664522513607e-06, + "loss": 0.14981061220169067, + "num_tokens": 8108566.0, + "reward": 0.11250000074505806, + "reward_std": 0.27998724579811096, + "rewards/reward_financial_reasoning/mean": 0.11250000074505806, + "rewards/reward_financial_reasoning/std": 0.27998724579811096, + "step": 906, + "step_time": 21.12849767700027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 17.75, + "completions/mean_terminated_length": 17.75, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.40427426536064115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22057682275772095, + "kl": 1.7570404410362244, + "learning_rate": 3.3127164769915886e-06, + "loss": 0.06977805495262146, + "num_tokens": 8123674.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 908, + "step_time": 15.24695787949895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.40516473731077474, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6793663501739502, + "kl": 2.9984846711158752, + "learning_rate": 3.3077684314695695e-06, + "loss": 0.10924842953681946, + "num_tokens": 8143346.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 910, + "step_time": 17.416411400998186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 34.9375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 34.9375, + "completions/mean_terminated_length": 34.9375, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.40605520926090827, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6713170409202576, + "kl": 2.7019808292388916, + "learning_rate": 3.302820385947551e-06, + "loss": 0.15788856148719788, + "num_tokens": 8167513.0, + "reward": 0.23750000912696123, + "reward_std": 0.2931488901376724, + "rewards/reward_financial_reasoning/mean": 0.23750000912696123, + "rewards/reward_financial_reasoning/std": 0.2931489050388336, + "step": 912, + "step_time": 50.308363749998534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 26.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.0, + "completions/max_terminated_length": 61.0, + "completions/mean_length": 26.375, + "completions/mean_terminated_length": 26.375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.40694568121104185, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.441307544708252, + "kl": 3.6447712928056717, + "learning_rate": 3.297872340425532e-06, + "loss": 0.23996052145957947, + "num_tokens": 8187695.0, + "reward": 0.19999999552965164, + "reward_std": 0.41815026104450226, + "rewards/reward_financial_reasoning/mean": 0.19999999552965164, + "rewards/reward_financial_reasoning/std": 0.41815026849508286, + "step": 914, + "step_time": 29.07546422100131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 15.375, + "completions/mean_terminated_length": 15.375, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.40783615316117544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24823780357837677, + "kl": 4.964463084936142, + "learning_rate": 3.292924294903513e-06, + "loss": 0.17128558456897736, + "num_tokens": 8200701.0, + "reward": 0.20000000670552254, + "reward_std": 0.26726123690605164, + "rewards/reward_financial_reasoning/mean": 0.20000000670552254, + "rewards/reward_financial_reasoning/std": 0.26726123690605164, + "step": 916, + "step_time": 19.823334593500476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.5, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 138.5, + "completions/max_terminated_length": 19.0, + "completions/mean_length": 31.5, + "completions/mean_terminated_length": 16.383928775787354, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.40872662511130897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31193023920059204, + "kl": 2.5705791860818863, + "learning_rate": 3.287976249381495e-06, + "loss": 0.1012052446603775, + "num_tokens": 8218805.0, + "reward": 0.17500000074505806, + "reward_std": 0.29398736357688904, + "rewards/reward_financial_reasoning/mean": 0.17500000074505806, + "rewards/reward_financial_reasoning/std": 0.29398736357688904, + "step": 918, + "step_time": 50.4936487595005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 27.5, + "completions/max_terminated_length": 27.5, + "completions/mean_length": 23.5, + "completions/mean_terminated_length": 23.5, + "completions/min_length": 19.5, + "completions/min_terminated_length": 19.5, + "epoch": 0.40961709706144256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12017732858657837, + "kl": 2.4508478343486786, + "learning_rate": 3.283028203859476e-06, + "loss": 0.09826779365539551, + "num_tokens": 8239765.0, + "reward": -3.725290298461914e-09, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 920, + "step_time": 19.770568174495565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 37.5, + "completions/max_terminated_length": 37.5, + "completions/mean_length": 17.8125, + "completions/mean_terminated_length": 17.8125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.41050756901157615, + "frac_reward_zero_std": 0.75, + "grad_norm": 147.41748046875, + "kl": 5.577771902084351, + "learning_rate": 3.2780801583374573e-06, + "loss": 0.3650239408016205, + "num_tokens": 8259626.0, + "reward": 0.4749999865889549, + "reward_std": 0.12416292726993561, + "rewards/reward_financial_reasoning/mean": 0.4749999865889549, + "rewards/reward_financial_reasoning/std": 0.12416292726993561, + "step": 922, + "step_time": 22.212593034008023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 15.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 15.125, + "completions/mean_terminated_length": 15.125, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.41139804096170973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19488407671451569, + "kl": 2.578184872865677, + "learning_rate": 3.2731321128154383e-06, + "loss": 0.10070281475782394, + "num_tokens": 8276492.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 924, + "step_time": 14.635534849996475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 35.8125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 141.0, + "completions/max_terminated_length": 23.5, + "completions/mean_length": 35.8125, + "completions/mean_terminated_length": 20.964285850524902, + "completions/min_length": 18.5, + "completions/min_terminated_length": 18.5, + "epoch": 0.41228851291184326, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15457630157470703, + "kl": 2.4652615785598755, + "learning_rate": 3.2681840672934196e-06, + "loss": 0.09598159044981003, + "num_tokens": 8288481.0, + "reward": 0.20000000670552254, + "reward_std": 0.26726123690605164, + "rewards/reward_financial_reasoning/mean": 0.20000000670552254, + "rewards/reward_financial_reasoning/std": 0.26726123690605164, + "step": 926, + "step_time": 45.35208458950365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 30.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 61.5, + "completions/max_terminated_length": 61.5, + "completions/mean_length": 30.6875, + "completions/mean_terminated_length": 30.6875, + "completions/min_length": 17.5, + "completions/min_terminated_length": 17.5, + "epoch": 0.41317898486197685, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5829043388366699, + "kl": 2.8620926439762115, + "learning_rate": 3.2632360217714006e-06, + "loss": 0.2158636450767517, + "num_tokens": 8306204.0, + "reward": -0.10000000335276127, + "reward_std": 0.21905138343572617, + "rewards/reward_financial_reasoning/mean": -0.10000000335276127, + "rewards/reward_financial_reasoning/std": 0.21905139833688736, + "step": 928, + "step_time": 27.723770248005167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.0, + "completions/max_terminated_length": 25.0, + "completions/mean_length": 16.8125, + "completions/mean_terminated_length": 16.8125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.41406945681211044, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.07053980231285095, + "kl": 3.5780192017555237, + "learning_rate": 3.258287976249382e-06, + "loss": 0.20760619640350342, + "num_tokens": 8319833.0, + "reward": 0.17500001192092896, + "reward_std": 0.472439780831337, + "rewards/reward_financial_reasoning/mean": 0.17500001192092896, + "rewards/reward_financial_reasoning/std": 0.47243979573249817, + "step": 930, + "step_time": 15.179649859499477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.3125, + "completions/clipped_ratio": 0.125, + "completions/max_length": 133.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 41.3125, + "completions/mean_terminated_length": 11.229166984558105, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.41495992876224397, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9923263788223267, + "kl": 4.27918504178524, + "learning_rate": 3.253339930727363e-06, + "loss": 0.2304915338754654, + "num_tokens": 8340382.0, + "reward": 0.46249998826533556, + "reward_std": 0.159518264234066, + "rewards/reward_financial_reasoning/mean": 0.46249998826533556, + "rewards/reward_financial_reasoning/std": 0.159518264234066, + "step": 932, + "step_time": 50.51667185199767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.75, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.5, + "completions/max_terminated_length": 34.5, + "completions/mean_length": 19.75, + "completions/mean_terminated_length": 19.75, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.41585040071237755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15250875055789948, + "kl": 2.793649807572365, + "learning_rate": 3.2483918852053443e-06, + "loss": 0.09913206100463867, + "num_tokens": 8359162.0, + "reward": 0.1250000074505806, + "reward_std": 0.34743961691856384, + "rewards/reward_financial_reasoning/mean": 0.1250000074505806, + "rewards/reward_financial_reasoning/std": 0.34743963181972504, + "step": 934, + "step_time": 20.64623363999999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 51.3125, + "completions/clipped_ratio": 0.125, + "completions/max_length": 140.5, + "completions/max_terminated_length": 34.5, + "completions/mean_length": 51.3125, + "completions/mean_terminated_length": 21.75, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.41674087266251114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18754097819328308, + "kl": 1.8101801723241806, + "learning_rate": 3.2434438396833252e-06, + "loss": 0.05962216854095459, + "num_tokens": 8383911.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 936, + "step_time": 55.66111214999546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 33.875, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 155.0, + "completions/max_terminated_length": 36.0, + "completions/mean_length": 33.875, + "completions/mean_terminated_length": 18.48214292526245, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.41763134461264473, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.2415056228637695, + "kl": 2.8781251162290573, + "learning_rate": 3.2384957941613066e-06, + "loss": 0.0792328417301178, + "num_tokens": 8398349.0, + "reward": 0.4499999862164259, + "reward_std": 0.14603425562381744, + "rewards/reward_financial_reasoning/mean": 0.4499999862164259, + "rewards/reward_financial_reasoning/std": 0.14603426307439804, + "step": 938, + "step_time": 51.50570740849798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 26.5, + "completions/max_terminated_length": 26.5, + "completions/mean_length": 20.25, + "completions/mean_terminated_length": 20.25, + "completions/min_length": 15.5, + "completions/min_terminated_length": 15.5, + "epoch": 0.41852181656277826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08914017677307129, + "kl": 5.042290985584259, + "learning_rate": 3.233547748639288e-06, + "loss": 0.17624709010124207, + "num_tokens": 8416193.0, + "reward": -0.15000000596046448, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -0.15000000596046448, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 940, + "step_time": 17.541959170001064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 55.4375, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 138.0, + "completions/max_terminated_length": 132.5, + "completions/mean_length": 55.4375, + "completions/mean_terminated_length": 43.892860412597656, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.41941228851291185, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.0707364082336426, + "kl": 1.8768098652362823, + "learning_rate": 3.228599703117269e-06, + "loss": 0.08203748613595963, + "num_tokens": 8438224.0, + "reward": -0.01249999925494194, + "reward_std": 0.18850919604301453, + "rewards/reward_financial_reasoning/mean": -0.01249999925494194, + "rewards/reward_financial_reasoning/std": 0.18850919604301453, + "step": 942, + "step_time": 52.522713671496604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 46.4375, + "completions/clipped_ratio": 0.125, + "completions/max_length": 138.0, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 46.4375, + "completions/mean_terminated_length": 16.6875, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.42030276046304543, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.3209104537963867, + "kl": 2.6846156250685453, + "learning_rate": 3.2236516575952503e-06, + "loss": 0.13027328252792358, + "num_tokens": 8459887.0, + "reward": 0.32500001043081284, + "reward_std": 0.38347896933555603, + "rewards/reward_financial_reasoning/mean": 0.32500001043081284, + "rewards/reward_financial_reasoning/std": 0.3834789991378784, + "step": 944, + "step_time": 52.64768808199733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 16.6875, + "completions/mean_terminated_length": 16.6875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.42119323241317896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1603250652551651, + "kl": 2.955336630344391, + "learning_rate": 3.2187036120732313e-06, + "loss": 0.1090928167104721, + "num_tokens": 8482226.0, + "reward": 0.1250000074505806, + "reward_std": 0.34743961691856384, + "rewards/reward_financial_reasoning/mean": 0.1250000074505806, + "rewards/reward_financial_reasoning/std": 0.34743963181972504, + "step": 946, + "step_time": 19.271809785506775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.6875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.5, + "completions/max_terminated_length": 29.5, + "completions/mean_length": 18.6875, + "completions/mean_terminated_length": 18.6875, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.42208370436331255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6349166631698608, + "kl": 2.067967712879181, + "learning_rate": 3.2137555665512126e-06, + "loss": 0.07463287562131882, + "num_tokens": 8501501.0, + "reward": 0.30000001192092896, + "reward_std": 0.5345224589109421, + "rewards/reward_financial_reasoning/mean": 0.30000001192092896, + "rewards/reward_financial_reasoning/std": 0.5345224738121033, + "step": 948, + "step_time": 19.316131935000158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 39.5, + "completions/max_terminated_length": 39.5, + "completions/mean_length": 23.125, + "completions/mean_terminated_length": 23.125, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.42297417631344614, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.08936484903097153, + "kl": 2.352201282978058, + "learning_rate": 3.2088075210291936e-06, + "loss": 0.05001607537269592, + "num_tokens": 8516351.0, + "reward": 0.5125000029802322, + "reward_std": 0.3090885281562805, + "rewards/reward_financial_reasoning/mean": 0.5125000029802322, + "rewards/reward_financial_reasoning/std": 0.3090885281562805, + "step": 950, + "step_time": 19.179083563998574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 69.5, + "completions/clipped_ratio": 0.25, + "completions/max_length": 256.0, + "completions/max_terminated_length": 11.0, + "completions/mean_length": 69.5, + "completions/mean_terminated_length": 7.428571701049805, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.4238646482635797, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.2313220351934433, + "kl": 4.225497528910637, + "learning_rate": 3.203859475507175e-06, + "loss": 0.42357900738716125, + "num_tokens": 8535511.0, + "reward": 0.42499999701976776, + "reward_std": 0.5846030414104462, + "rewards/reward_financial_reasoning/mean": 0.42499999701976776, + "rewards/reward_financial_reasoning/std": 0.5846030116081238, + "step": 952, + "step_time": 83.1467112555074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 17.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 31.5, + "completions/max_terminated_length": 31.5, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.42475512021371326, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17937541007995605, + "kl": 3.103740006685257, + "learning_rate": 3.198911429985156e-06, + "loss": 0.1278012990951538, + "num_tokens": 8545871.0, + "reward": 0.2750000059604645, + "reward_std": 0.5612486004829407, + "rewards/reward_financial_reasoning/mean": 0.2750000059604645, + "rewards/reward_financial_reasoning/std": 0.5612486004829407, + "step": 954, + "step_time": 14.637012897994282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 6.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9.5, + "completions/max_terminated_length": 9.5, + "completions/mean_length": 6.0, + "completions/mean_terminated_length": 6.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.42564559216384684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2365179806947708, + "kl": 6.207745045423508, + "learning_rate": 3.1939633844631373e-06, + "loss": 0.22508688271045685, + "num_tokens": 8560383.0, + "reward": 0.5249999910593033, + "reward_std": 0.40089183300733566, + "rewards/reward_financial_reasoning/mean": 0.5249999910593033, + "rewards/reward_financial_reasoning/std": 0.40089183300733566, + "step": 956, + "step_time": 10.830135092503042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 41.5625, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 138.5, + "completions/max_terminated_length": 116.5, + "completions/mean_length": 41.5625, + "completions/mean_terminated_length": 28.312501907348633, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.42653606411398043, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7334434390068054, + "kl": 2.0300208553671837, + "learning_rate": 3.1890153389411182e-06, + "loss": 0.20082223415374756, + "num_tokens": 8573904.0, + "reward": 0.0, + "reward_std": 0.35675284266471863, + "rewards/reward_financial_reasoning/mean": 0.0, + "rewards/reward_financial_reasoning/std": 0.3567528575658798, + "step": 958, + "step_time": 47.42274022550191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 20.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 32.5, + "completions/max_terminated_length": 32.5, + "completions/mean_length": 20.5, + "completions/mean_terminated_length": 20.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.42742653606411396, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19361354410648346, + "kl": 2.600453108549118, + "learning_rate": 3.1840672934190996e-06, + "loss": 0.09075171500444412, + "num_tokens": 8595496.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 960, + "step_time": 21.33384811499127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 16.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 45.5, + "completions/max_terminated_length": 45.5, + "completions/mean_length": 16.8125, + "completions/mean_terminated_length": 16.8125, + "completions/min_length": 11.5, + "completions/min_terminated_length": 11.5, + "epoch": 0.42831700801424755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20128197968006134, + "kl": 2.412764996290207, + "learning_rate": 3.179119247897081e-06, + "loss": 0.0916946604847908, + "num_tokens": 8610477.0, + "reward": 0.02500000037252903, + "reward_std": 0.08017837256193161, + "rewards/reward_financial_reasoning/mean": 0.02500000037252903, + "rewards/reward_financial_reasoning/std": 0.08017837256193161, + "step": 962, + "step_time": 20.371279284001503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 40.375, + "completions/clipped_ratio": 0.125, + "completions/max_length": 138.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 40.375, + "completions/mean_terminated_length": 9.270833492279053, + "completions/min_length": 5.5, + "completions/min_terminated_length": 5.5, + "epoch": 0.42920747996438113, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11826738715171814, + "kl": 3.3262559920549393, + "learning_rate": 3.174171202375062e-06, + "loss": 0.11317823082208633, + "num_tokens": 8630323.0, + "reward": 0.025000005960464478, + "reward_std": 0.24053511023521423, + "rewards/reward_financial_reasoning/mean": 0.025000005960464478, + "rewards/reward_financial_reasoning/std": 0.24053512513637543, + "step": 964, + "step_time": 49.96986727849435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 36.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.5, + "completions/max_terminated_length": 135.5, + "completions/mean_length": 36.8125, + "completions/mean_terminated_length": 36.8125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.4300979519145147, + "frac_reward_zero_std": 0.5, + "grad_norm": 5.354856967926025, + "kl": 2.3516476303339005, + "learning_rate": 3.1692231568530433e-06, + "loss": 0.2586827874183655, + "num_tokens": 8647824.0, + "reward": 0.1875000074505806, + "reward_std": 0.27381163090467453, + "rewards/reward_financial_reasoning/mean": 0.1875000074505806, + "rewards/reward_financial_reasoning/std": 0.2738116607069969, + "step": 966, + "step_time": 46.45492545999514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 27.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 29.5, + "completions/max_terminated_length": 29.5, + "completions/mean_length": 27.3125, + "completions/mean_terminated_length": 27.3125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.43098842386464825, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06150537729263306, + "kl": 1.8199052214622498, + "learning_rate": 3.1642751113310242e-06, + "loss": 0.07260263711214066, + "num_tokens": 8667645.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/reward_financial_reasoning/mean": 0.10000000149011612, + "rewards/reward_financial_reasoning/std": 0.0, + "step": 968, + "step_time": 19.451285248505883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 76.125, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 256.0, + "completions/max_terminated_length": 124.5, + "completions/mean_length": 76.125, + "completions/mean_terminated_length": 35.178571701049805, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.43187889581478184, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.2838754653930664, + "kl": 1.8590649515390396, + "learning_rate": 3.159327065809006e-06, + "loss": 0.2001737803220749, + "num_tokens": 8683503.0, + "reward": 0.32500001043081284, + "reward_std": 0.38347896933555603, + "rewards/reward_financial_reasoning/mean": 0.32500001043081284, + "rewards/reward_financial_reasoning/std": 0.3834789991378784, + "step": 970, + "step_time": 78.57448822999868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 45.5625, + "completions/clipped_ratio": 0.125, + "completions/max_length": 154.5, + "completions/max_terminated_length": 32.0, + "completions/mean_length": 45.5625, + "completions/mean_terminated_length": 14.895833492279053, + "completions/min_length": 7.5, + "completions/min_terminated_length": 7.5, + "epoch": 0.4327693677649154, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.10290610790252686, + "kl": 3.8507900685071945, + "learning_rate": 3.1543790202869866e-06, + "loss": 0.2706332206726074, + "num_tokens": 8692640.0, + "reward": 0.25000000558793545, + "reward_std": 0.31163340061903, + "rewards/reward_financial_reasoning/mean": 0.25000000558793545, + "rewards/reward_financial_reasoning/std": 0.31163340061903, + "step": 972, + "step_time": 46.23696361999828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 19.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 25.5, + "completions/max_terminated_length": 25.5, + "completions/mean_length": 19.5625, + "completions/mean_terminated_length": 19.5625, + "completions/min_length": 13.5, + "completions/min_terminated_length": 13.5, + "epoch": 0.43365983971504896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1872958242893219, + "kl": 2.4179421216249466, + "learning_rate": 3.1494309747649684e-06, + "loss": 0.09821489453315735, + "num_tokens": 8713617.0, + "reward": 0.2750000096857548, + "reward_std": 0.18708287179470062, + "rewards/reward_financial_reasoning/mean": 0.2750000096857548, + "rewards/reward_financial_reasoning/std": 0.18708288669586182, + "step": 974, + "step_time": 18.739151772500918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 12.0625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 23.0, + "completions/max_terminated_length": 23.0, + "completions/mean_length": 12.0625, + "completions/mean_terminated_length": 12.0625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.43455031166518254, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4092589616775513, + "kl": 3.115272670984268, + "learning_rate": 3.144482929242949e-06, + "loss": 0.1181776374578476, + "num_tokens": 8734858.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 976, + "step_time": 18.696438336995925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 18.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.5, + "completions/max_terminated_length": 22.5, + "completions/mean_length": 18.8125, + "completions/mean_terminated_length": 18.8125, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.43544078361531613, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09314737468957901, + "kl": 1.7916912287473679, + "learning_rate": 3.1395348837209307e-06, + "loss": 0.06952106952667236, + "num_tokens": 8750415.0, + "reward": 0.1250000074505806, + "reward_std": 0.34743961691856384, + "rewards/reward_financial_reasoning/mean": 0.1250000074505806, + "rewards/reward_financial_reasoning/std": 0.34743963181972504, + "step": 978, + "step_time": 14.855361223002546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 22.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 49.5, + "completions/max_terminated_length": 49.5, + "completions/mean_length": 22.0, + "completions/mean_terminated_length": 22.0, + "completions/min_length": 10.5, + "completions/min_terminated_length": 10.5, + "epoch": 0.4363312555654497, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.07393768429756165, + "kl": 3.0034860372543335, + "learning_rate": 3.134586838198912e-06, + "loss": 0.12021099776029587, + "num_tokens": 8767855.0, + "reward": 0.46250002086162567, + "reward_std": 0.39018382132053375, + "rewards/reward_financial_reasoning/mean": 0.46250002086162567, + "rewards/reward_financial_reasoning/std": 0.39018386602401733, + "step": 980, + "step_time": 23.458823763998225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.8125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 24.5, + "completions/max_terminated_length": 24.5, + "completions/mean_length": 14.8125, + "completions/mean_terminated_length": 14.8125, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.43722172751558325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14612756669521332, + "kl": 3.3168026506900787, + "learning_rate": 3.129638792676893e-06, + "loss": 0.13220839202404022, + "num_tokens": 8787948.0, + "reward": -0.07499999925494194, + "reward_std": 0.026726126670837402, + "rewards/reward_financial_reasoning/mean": -0.07499999925494194, + "rewards/reward_financial_reasoning/std": 0.026726126670837402, + "step": 982, + "step_time": 18.28132488799747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 47.125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.5, + "completions/max_terminated_length": 138.5, + "completions/mean_length": 47.125, + "completions/mean_terminated_length": 47.125, + "completions/min_length": 9.5, + "completions/min_terminated_length": 9.5, + "epoch": 0.43811219946571683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17604397237300873, + "kl": 11.56613752245903, + "learning_rate": 3.1246907471548744e-06, + "loss": 0.22730720043182373, + "num_tokens": 8807486.0, + "reward": 0.2000000085681677, + "reward_std": 0.26726124435663223, + "rewards/reward_financial_reasoning/mean": 0.2000000085681677, + "rewards/reward_financial_reasoning/std": 0.2672612592577934, + "step": 984, + "step_time": 49.57627448099811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 28.5625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.5, + "completions/max_terminated_length": 89.5, + "completions/mean_length": 28.5625, + "completions/mean_terminated_length": 28.5625, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.4390026714158504, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.47641658782959, + "kl": 2.2521740794181824, + "learning_rate": 3.1197427016328553e-06, + "loss": 0.14822399616241455, + "num_tokens": 8828775.0, + "reward": 0.07499999925494194, + "reward_std": 0.0707106813788414, + "rewards/reward_financial_reasoning/mean": 0.07499999925494194, + "rewards/reward_financial_reasoning/std": 0.0707106813788414, + "step": 986, + "step_time": 37.008812667496386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 14.3125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 14.3125, + "completions/mean_terminated_length": 14.3125, + "completions/min_length": 6.5, + "completions/min_terminated_length": 6.5, + "epoch": 0.43989314336598395, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.25839534401893616, + "kl": 6.1503177136182785, + "learning_rate": 3.1147946561108367e-06, + "loss": 0.19753281772136688, + "num_tokens": 8850516.0, + "reward": 0.5750000178813934, + "reward_std": 0.49124836921691895, + "rewards/reward_financial_reasoning/mean": 0.5750000178813934, + "rewards/reward_financial_reasoning/std": 0.4912484139204025, + "step": 988, + "step_time": 18.673066382496472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 13.625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.5, + "completions/max_terminated_length": 20.5, + "completions/mean_length": 13.625, + "completions/mean_terminated_length": 13.625, + "completions/min_length": 8.5, + "completions/min_terminated_length": 8.5, + "epoch": 0.44078361531611754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7419772148132324, + "kl": 3.7723660320043564, + "learning_rate": 3.1098466105888177e-06, + "loss": 0.12690135836601257, + "num_tokens": 8871710.0, + "reward": 0.32500000670552254, + "reward_std": 0.24053511023521423, + "rewards/reward_financial_reasoning/mean": 0.32500000670552254, + "rewards/reward_financial_reasoning/std": 0.24053512513637543, + "step": 990, + "step_time": 17.83083552050084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 23.25, + "completions/clipped_ratio": 0.0, + "completions/max_length": 34.5, + "completions/max_terminated_length": 34.5, + "completions/mean_length": 23.25, + "completions/mean_terminated_length": 23.25, + "completions/min_length": 12.5, + "completions/min_terminated_length": 12.5, + "epoch": 0.4416740872662511, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1373910903930664, + "kl": 2.4735867977142334, + "learning_rate": 3.104898565066799e-06, + "loss": 0.09462883323431015, + "num_tokens": 8890818.0, + "reward": -0.05000000074505806, + "reward_std": 0.16035674512386322, + "rewards/reward_financial_reasoning/mean": -0.05000000074505806, + "rewards/reward_financial_reasoning/std": 0.16035674512386322, + "step": 992, + "step_time": 20.594509614002163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 37.5, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.5, + "completions/max_terminated_length": 102.5, + "completions/mean_length": 37.5, + "completions/mean_terminated_length": 37.5, + "completions/min_length": 17.5, + "completions/min_terminated_length": 17.5, + "epoch": 0.44256455921638466, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07724303007125854, + "kl": 2.4766226708889008, + "learning_rate": 3.09995051954478e-06, + "loss": 0.09341251105070114, + "num_tokens": 8907066.0, + "reward": -3.725290298461914e-09, + "reward_std": 0.10690449923276901, + "rewards/reward_financial_reasoning/mean": -3.725290298461914e-09, + "rewards/reward_financial_reasoning/std": 0.10690450668334961, + "step": 994, + "step_time": 36.1086978409985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 46.5625, + "completions/clipped_ratio": 0.125, + "completions/max_length": 142.5, + "completions/max_terminated_length": 24.0, + "completions/mean_length": 46.5625, + "completions/mean_terminated_length": 15.666666984558105, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.44345503116651824, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3874448537826538, + "kl": 2.218121826648712, + "learning_rate": 3.0950024740227614e-06, + "loss": 0.17174197733402252, + "num_tokens": 8922219.0, + "reward": 0.32500000670552254, + "reward_std": 0.2314550280570984, + "rewards/reward_financial_reasoning/mean": 0.32500000670552254, + "rewards/reward_financial_reasoning/std": 0.2314550280570984, + "step": 996, + "step_time": 45.69451289299468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 31.3125, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 136.5, + "completions/max_terminated_length": 28.5, + "completions/mean_length": 31.3125, + "completions/mean_terminated_length": 16.54464340209961, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.44434550311665183, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.13446766138076782, + "kl": 3.6593779623508453, + "learning_rate": 3.0900544285007423e-06, + "loss": 0.1636892408132553, + "num_tokens": 8938832.0, + "reward": 0.22500000894069672, + "reward_std": 0.434930756688118, + "rewards/reward_financial_reasoning/mean": 0.22500000894069672, + "rewards/reward_financial_reasoning/std": 0.4349307715892792, + "step": 998, + "step_time": 47.712723782995454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 56.625, + "completions/clipped_ratio": 0.125, + "completions/max_length": 150.0, + "completions/max_terminated_length": 50.5, + "completions/mean_length": 56.625, + "completions/mean_terminated_length": 26.70833396911621, + "completions/min_length": 15.5, + "completions/min_terminated_length": 15.5, + "epoch": 0.4452359750667854, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.231067657470703, + "kl": 1.4210374727845192, + "learning_rate": 3.0851063829787237e-06, + "loss": 0.08275322616100311, + "num_tokens": 8960930.0, + "reward": 0.23750001192092896, + "reward_std": 0.5119454711675644, + "rewards/reward_financial_reasoning/mean": 0.23750001192092896, + "rewards/reward_financial_reasoning/std": 0.5119454860687256, + "step": 1000, + "step_time": 54.88188034250561 + } + ], + "logging_steps": 2, + "max_steps": 2246, + "num_input_tokens_seen": 8960930, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}