diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,104034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.46194710705624203, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 119.46875, + "completions/mean_terminated_length": 119.46875, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.00011548677676406051, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 8e-06, + "loss": 0.0, + "num_tokens": 19759.0, + "reward": 3.9780216217041016, + "reward_std": 0.12432922422885895, + "rewards/reward_fn/mean": 3.9780216217041016, + "rewards/reward_fn/std": 0.12432923913002014, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 202.4375, + "completions/mean_terminated_length": 202.4375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.00023097355352812103, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.008802666554402094, + "learning_rate": 7.998e-06, + "loss": 0.0004, + "num_tokens": 51197.0, + "reward": 2.806682586669922, + "reward_std": 0.234120711684227, + "rewards/reward_fn/mean": 2.806682586669922, + "rewards/reward_fn/std": 0.2341206967830658, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 83.0625, + "completions/mean_terminated_length": 83.0625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.00034646033029218156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.019857537306961603, + "learning_rate": 7.996e-06, + "loss": 0.0008, + "num_tokens": 67327.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 187.03125, + "completions/mean_terminated_length": 187.03125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.00046194710705624206, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.016124493195093237, + "learning_rate": 7.994e-06, + "loss": 0.0006, + "num_tokens": 96704.0, + "reward": 3.6763052940368652, + "reward_std": 0.35056957602500916, + "rewards/reward_fn/mean": 3.6763052940368652, + "rewards/reward_fn/std": 0.35056957602500916, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 49.6875, + "completions/mean_terminated_length": 49.6875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.0005774338838203026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.008157416436006315, + "learning_rate": 7.991999999999999e-06, + "loss": 0.0003, + "num_tokens": 112918.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 152.65625, + "completions/mean_terminated_length": 152.65625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.0006929206605843631, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.01975844756816514, + "learning_rate": 7.99e-06, + "loss": 0.0008, + "num_tokens": 132939.0, + "reward": 3.9280753135681152, + "reward_std": 0.40686798095703125, + "rewards/reward_fn/mean": 3.9280753135681152, + "rewards/reward_fn/std": 0.40686795115470886, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 85.15625, + "completions/mean_terminated_length": 85.15625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.0008084074373484236, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "kl": 0.028681755080469884, + "learning_rate": 7.988e-06, + "loss": 0.0011, + "num_tokens": 153264.0, + "reward": 2.9875006675720215, + "reward_std": 0.03269573301076889, + "rewards/reward_fn/mean": 2.9875006675720215, + "rewards/reward_fn/std": 0.03269574046134949, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 190.71875, + "completions/mean_terminated_length": 190.71875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.0009238942141124841, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.012461876023735385, + "learning_rate": 7.986e-06, + "loss": 0.0005, + "num_tokens": 176455.0, + "reward": 3.39101505279541, + "reward_std": 1.0070130825042725, + "rewards/reward_fn/mean": 3.39101505279541, + "rewards/reward_fn/std": 1.0070130825042725, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 123.78125, + "completions/mean_terminated_length": 123.78125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.0010393809908765446, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.022061517724068835, + "learning_rate": 7.984e-06, + "loss": 0.0009, + "num_tokens": 201440.0, + "reward": 3.799323320388794, + "reward_std": 0.39548802375793457, + "rewards/reward_fn/mean": 3.799323320388794, + "rewards/reward_fn/std": 0.3954879641532898, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 235.625, + "completions/mean_terminated_length": 235.625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.0011548677676406051, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.015987483493518084, + "learning_rate": 7.981999999999999e-06, + "loss": 0.0006, + "num_tokens": 232916.0, + "reward": 3.7794697284698486, + "reward_std": 0.6966649889945984, + "rewards/reward_fn/mean": 3.7794697284698486, + "rewards/reward_fn/std": 0.6966649293899536, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 223.03125, + "completions/mean_terminated_length": 223.03125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.0012703545444046657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1806640625, + "kl": 0.02722701456514187, + "learning_rate": 7.98e-06, + "loss": 0.0011, + "num_tokens": 254325.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 126.3125, + "completions/mean_terminated_length": 126.3125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.0013858413211687262, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.232421875, + "kl": 0.03448950970778242, + "learning_rate": 7.977999999999999e-06, + "loss": 0.0014, + "num_tokens": 270175.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 879.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 323.84375, + "completions/mean_terminated_length": 323.84375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.0015013280979327868, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.015641329751815647, + "learning_rate": 7.976e-06, + "loss": 0.0006, + "num_tokens": 304250.0, + "reward": 3.6373510360717773, + "reward_std": 0.8205180764198303, + "rewards/reward_fn/mean": 3.6373510360717773, + "rewards/reward_fn/std": 0.8205181360244751, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 100.0, + "completions/max_terminated_length": 100.0, + "completions/mean_length": 63.84375, + "completions/mean_terminated_length": 63.84375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.0016168148746968471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.162109375, + "kl": 0.02290871791774407, + "learning_rate": 7.974e-06, + "loss": 0.0009, + "num_tokens": 322069.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 91.25, + "completions/mean_terminated_length": 91.25, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.0017323016514609077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2099609375, + "kl": 0.03030280352686532, + "learning_rate": 7.972e-06, + "loss": 0.0012, + "num_tokens": 332733.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 262.3125, + "completions/mean_terminated_length": 262.3125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.0018477884282249682, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.013212046920671128, + "learning_rate": 7.97e-06, + "loss": 0.0005, + "num_tokens": 369511.0, + "reward": 3.242628574371338, + "reward_std": 0.9185323119163513, + "rewards/reward_fn/mean": 3.242628574371338, + "rewards/reward_fn/std": 0.9185322523117065, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 104.96875, + "completions/mean_terminated_length": 104.96875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.001963275204989029, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.15625, + "kl": 0.013855469762347639, + "learning_rate": 7.967999999999999e-06, + "loss": 0.0006, + "num_tokens": 396806.0, + "reward": 3.785733222961426, + "reward_std": 0.5066823959350586, + "rewards/reward_fn/mean": 3.785733222961426, + "rewards/reward_fn/std": 0.5066823363304138, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 84.125, + "completions/mean_terminated_length": 84.125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.002078761981753089, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "kl": 0.02629521765629761, + "learning_rate": 7.966e-06, + "loss": 0.0011, + "num_tokens": 416618.0, + "reward": 3.965245246887207, + "reward_std": 0.19660283625125885, + "rewards/reward_fn/mean": 3.965245246887207, + "rewards/reward_fn/std": 0.19660283625125885, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 70.40625, + "completions/mean_terminated_length": 70.40625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.00219424875851715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.158203125, + "kl": 0.018844263089704327, + "learning_rate": 7.964e-06, + "loss": 0.0008, + "num_tokens": 431863.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 216.15625, + "completions/mean_terminated_length": 216.15625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.0023097355352812102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.018434493278618902, + "learning_rate": 7.962e-06, + "loss": 0.0007, + "num_tokens": 458748.0, + "reward": 3.8236019611358643, + "reward_std": 0.5905573964118958, + "rewards/reward_fn/mean": 3.8236019611358643, + "rewards/reward_fn/std": 0.5905573964118958, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 224.75, + "completions/mean_terminated_length": 224.75, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.002425222312045271, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.016903131443541497, + "learning_rate": 7.96e-06, + "loss": 0.0007, + "num_tokens": 484756.0, + "reward": 3.617159843444824, + "reward_std": 0.9476330280303955, + "rewards/reward_fn/mean": 3.617159843444824, + "rewards/reward_fn/std": 0.9476329684257507, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 212.375, + "completions/mean_terminated_length": 212.375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.0025407090888093314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.021785891556646675, + "learning_rate": 7.957999999999999e-06, + "loss": 0.0009, + "num_tokens": 503584.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 206.75, + "completions/mean_terminated_length": 206.75, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.0026561958655733917, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.019270089163910598, + "learning_rate": 7.956e-06, + "loss": 0.0008, + "num_tokens": 529240.0, + "reward": 3.5072920322418213, + "reward_std": 0.42814141511917114, + "rewards/reward_fn/mean": 3.5072920322418213, + "rewards/reward_fn/std": 0.42814138531684875, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 152.0625, + "completions/mean_terminated_length": 152.0625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.0027716826423374525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.012605793381226249, + "learning_rate": 7.953999999999999e-06, + "loss": 0.0005, + "num_tokens": 555258.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 48.5625, + "completions/mean_terminated_length": 48.5625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.002887169419101513, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.65625, + "kl": 0.022014679401763715, + "learning_rate": 7.952e-06, + "loss": 0.0009, + "num_tokens": 582796.0, + "reward": 3.125, + "reward_std": 1.6800537109375, + "rewards/reward_fn/mean": 3.125, + "rewards/reward_fn/std": 1.6800537109375, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 196.5, + "completions/mean_terminated_length": 196.5, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.0030026561958655736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.016049556928919628, + "learning_rate": 7.95e-06, + "loss": 0.0006, + "num_tokens": 601916.0, + "reward": 3.874605178833008, + "reward_std": 0.33823055028915405, + "rewards/reward_fn/mean": 3.874605178833008, + "rewards/reward_fn/std": 0.33823058009147644, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1219.0, + "completions/mean_length": 552.96875, + "completions/mean_terminated_length": 504.7419128417969, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.003118142972629634, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.00742558111960534, + "learning_rate": 7.948e-06, + "loss": 0.0003, + "num_tokens": 634747.0, + "reward": 3.228135585784912, + "reward_std": 1.1602466106414795, + "rewards/reward_fn/mean": 3.228135585784912, + "rewards/reward_fn/std": 1.1602466106414795, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 144.59375, + "completions/mean_terminated_length": 144.59375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.0032336297493936943, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.016436098390840925, + "learning_rate": 7.946e-06, + "loss": 0.0007, + "num_tokens": 672526.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 434.34375, + "completions/mean_terminated_length": 434.34375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.003349116526157755, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.008521520518115722, + "learning_rate": 7.943999999999999e-06, + "loss": 0.0003, + "num_tokens": 701241.0, + "reward": 3.777053117752075, + "reward_std": 0.7043256759643555, + "rewards/reward_fn/mean": 3.777053117752075, + "rewards/reward_fn/std": 0.7043256759643555, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 190.125, + "completions/mean_terminated_length": 190.125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.0034646033029218154, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.019611396099207923, + "learning_rate": 7.942e-06, + "loss": 0.0008, + "num_tokens": 719933.0, + "reward": 1.9827988147735596, + "reward_std": 0.5363081097602844, + "rewards/reward_fn/mean": 1.9827988147735596, + "rewards/reward_fn/std": 0.5363081097602844, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 149.40625, + "completions/mean_terminated_length": 149.40625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.003580090079685876, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.014830526517471299, + "learning_rate": 7.94e-06, + "loss": 0.0006, + "num_tokens": 748458.0, + "reward": 3.4160242080688477, + "reward_std": 0.15208449959754944, + "rewards/reward_fn/mean": 3.4160242080688477, + "rewards/reward_fn/std": 0.15208451449871063, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 46.875, + "completions/mean_terminated_length": 46.875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.0036955768564499365, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.4375, + "kl": 0.027611331461230293, + "learning_rate": 7.938e-06, + "loss": 0.0011, + "num_tokens": 772358.0, + "reward": 3.375, + "reward_std": 1.4756081104278564, + "rewards/reward_fn/mean": 3.375, + "rewards/reward_fn/std": 1.4756081104278564, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 235.0625, + "completions/mean_terminated_length": 235.0625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.003811063633213997, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.013234368001576513, + "learning_rate": 7.936e-06, + "loss": 0.0005, + "num_tokens": 801128.0, + "reward": 3.58689284324646, + "reward_std": 0.6951528191566467, + "rewards/reward_fn/mean": 3.58689284324646, + "rewards/reward_fn/std": 0.695152759552002, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 326.25, + "completions/mean_terminated_length": 326.25, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.003926550409978058, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.010720414255047217, + "learning_rate": 7.934e-06, + "loss": 0.0004, + "num_tokens": 826640.0, + "reward": 3.929694175720215, + "reward_std": 0.3977096378803253, + "rewards/reward_fn/mean": 3.929694175720215, + "rewards/reward_fn/std": 0.39770957827568054, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 357.0625, + "completions/mean_terminated_length": 357.0625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.004042037186742118, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.012957294471561909, + "learning_rate": 7.932e-06, + "loss": 0.0005, + "num_tokens": 861394.0, + "reward": 3.0172581672668457, + "reward_std": 0.45352548360824585, + "rewards/reward_fn/mean": 3.0172581672668457, + "rewards/reward_fn/std": 0.45352548360824585, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 92.625, + "completions/mean_terminated_length": 92.625, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.004157523963506178, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.0186726140382234, + "learning_rate": 7.929999999999999e-06, + "loss": 0.0007, + "num_tokens": 877894.0, + "reward": 3.9285221099853516, + "reward_std": 0.40434005856513977, + "rewards/reward_fn/mean": 3.9285221099853516, + "rewards/reward_fn/std": 0.40434008836746216, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 180.125, + "completions/mean_terminated_length": 180.125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.004273010740270239, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.018501615471905097, + "learning_rate": 7.928e-06, + "loss": 0.0007, + "num_tokens": 899498.0, + "reward": 3.929746389389038, + "reward_std": 0.3974144756793976, + "rewards/reward_fn/mean": 3.929746389389038, + "rewards/reward_fn/std": 0.3974144756793976, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 203.5625, + "completions/mean_terminated_length": 203.5625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.0043884975170343, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.02284541963308584, + "learning_rate": 7.926e-06, + "loss": 0.0009, + "num_tokens": 918428.0, + "reward": 3.928581714630127, + "reward_std": 0.4040035009384155, + "rewards/reward_fn/mean": 3.928581714630127, + "rewards/reward_fn/std": 0.4040035009384155, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 959.0, + "completions/mean_length": 519.84375, + "completions/mean_terminated_length": 470.5483703613281, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.00450398429379836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.01311383857682813, + "learning_rate": 7.924e-06, + "loss": 0.0005, + "num_tokens": 947159.0, + "reward": 3.75, + "reward_std": 0.9837387204170227, + "rewards/reward_fn/mean": 3.75, + "rewards/reward_fn/std": 0.9837387204170227, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1042.0, + "completions/max_terminated_length": 1042.0, + "completions/mean_length": 349.59375, + "completions/mean_terminated_length": 349.59375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.0046194710705624205, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.01588868416729383, + "learning_rate": 7.922e-06, + "loss": 0.0006, + "num_tokens": 981866.0, + "reward": 3.573197841644287, + "reward_std": 0.9533390402793884, + "rewards/reward_fn/mean": 3.573197841644287, + "rewards/reward_fn/std": 0.9533389806747437, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 214.5625, + "completions/mean_terminated_length": 214.5625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.004734957847326481, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.019768436206504703, + "learning_rate": 7.92e-06, + "loss": 0.0008, + "num_tokens": 1007196.0, + "reward": 3.8132715225219727, + "reward_std": 0.3954962491989136, + "rewards/reward_fn/mean": 3.8132715225219727, + "rewards/reward_fn/std": 0.39549627900123596, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 104.1875, + "completions/mean_terminated_length": 104.1875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.004850444624090542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.02154825442994479, + "learning_rate": 7.918e-06, + "loss": 0.0009, + "num_tokens": 1031938.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 309.34375, + "completions/mean_terminated_length": 253.258056640625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.004965931400854602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7578125, + "kl": 0.01941796917526517, + "learning_rate": 7.916e-06, + "loss": 0.0008, + "num_tokens": 1054285.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 316.03125, + "completions/mean_terminated_length": 316.03125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.005081418177618663, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.015658196323784068, + "learning_rate": 7.913999999999999e-06, + "loss": 0.0006, + "num_tokens": 1082062.0, + "reward": 3.3944404125213623, + "reward_std": 1.0158030986785889, + "rewards/reward_fn/mean": 3.3944404125213623, + "rewards/reward_fn/std": 1.0158030986785889, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 113.46875, + "completions/mean_terminated_length": 113.46875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.0051969049543827235, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "kl": 0.019476017521810718, + "learning_rate": 7.912e-06, + "loss": 0.0008, + "num_tokens": 1111293.0, + "reward": 3.1329238414764404, + "reward_std": 0.17433500289916992, + "rewards/reward_fn/mean": 3.1329238414764404, + "rewards/reward_fn/std": 0.17433500289916992, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.005312391731146783, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.018452511401847005, + "learning_rate": 7.91e-06, + "loss": 0.0007, + "num_tokens": 1152325.0, + "reward": 3.7299599647521973, + "reward_std": 0.6932061910629272, + "rewards/reward_fn/mean": 3.7299599647521973, + "rewards/reward_fn/std": 0.6932061910629272, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 258.34375, + "completions/mean_terminated_length": 258.34375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.005427878507910844, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.01829337512026541, + "learning_rate": 7.908e-06, + "loss": 0.0007, + "num_tokens": 1183792.0, + "reward": 2.912796974182129, + "reward_std": 0.2788362205028534, + "rewards/reward_fn/mean": 2.912796974182129, + "rewards/reward_fn/std": 0.2788361608982086, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 78.5625, + "completions/mean_terminated_length": 78.5625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.005543365284674905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.169921875, + "kl": 0.02331431234779302, + "learning_rate": 7.905999999999999e-06, + "loss": 0.0009, + "num_tokens": 1201058.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 121.3125, + "completions/mean_terminated_length": 121.3125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.005658852061438965, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.027614344653557055, + "learning_rate": 7.904e-06, + "loss": 0.0011, + "num_tokens": 1227116.0, + "reward": 3.8580849170684814, + "reward_std": 0.5586425065994263, + "rewards/reward_fn/mean": 3.8580849170684814, + "rewards/reward_fn/std": 0.5586425065994263, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 207.25, + "completions/mean_terminated_length": 207.25, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.005774338838203026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.01689005215303041, + "learning_rate": 7.902e-06, + "loss": 0.0007, + "num_tokens": 1256020.0, + "reward": 2.6436564922332764, + "reward_std": 0.19687579572200775, + "rewards/reward_fn/mean": 2.6436564922332764, + "rewards/reward_fn/std": 0.19687579572200775, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 153.28125, + "completions/mean_terminated_length": 153.28125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.005889825614967086, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.024671147955814376, + "learning_rate": 7.9e-06, + "loss": 0.001, + "num_tokens": 1285309.0, + "reward": 3.9295945167541504, + "reward_std": 0.39827290177345276, + "rewards/reward_fn/mean": 3.9295945167541504, + "rewards/reward_fn/std": 0.39827290177345276, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 65.625, + "completions/mean_terminated_length": 65.625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.006005312391731147, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.984375, + "kl": 0.02745252480963245, + "learning_rate": 7.898e-06, + "loss": 0.0011, + "num_tokens": 1305009.0, + "reward": 3.9630134105682373, + "reward_std": 0.2092277705669403, + "rewards/reward_fn/mean": 3.9630134105682373, + "rewards/reward_fn/std": 0.2092277854681015, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.006120799168495207, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.018287211743881926, + "learning_rate": 7.896e-06, + "loss": 0.0007, + "num_tokens": 1324165.0, + "reward": 3.862231969833374, + "reward_std": 0.5421114563941956, + "rewards/reward_fn/mean": 3.862231969833374, + "rewards/reward_fn/std": 0.5421113967895508, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 127.0, + "completions/mean_terminated_length": 127.0, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.006236285945259268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.023276412044651806, + "learning_rate": 7.894e-06, + "loss": 0.0009, + "num_tokens": 1345829.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 245.53125, + "completions/mean_terminated_length": 245.53125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.006351772722023329, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.01977733540115878, + "learning_rate": 7.892e-06, + "loss": 0.0008, + "num_tokens": 1377878.0, + "reward": 3.5521240234375, + "reward_std": 0.5645003318786621, + "rewards/reward_fn/mean": 3.5521240234375, + "rewards/reward_fn/std": 0.5645003318786621, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 127.125, + "completions/mean_terminated_length": 127.125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.0064672594987873885, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.0324286661343649, + "learning_rate": 7.889999999999999e-06, + "loss": 0.0013, + "num_tokens": 1402682.0, + "reward": 3.9663496017456055, + "reward_std": 0.1903550773859024, + "rewards/reward_fn/mean": 3.9663496017456055, + "rewards/reward_fn/std": 0.19035504758358002, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 120.3125, + "completions/mean_terminated_length": 120.3125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.006582746275551449, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.03136725671356544, + "learning_rate": 7.888e-06, + "loss": 0.0013, + "num_tokens": 1428836.0, + "reward": 3.8867995738983154, + "reward_std": 0.46809327602386475, + "rewards/reward_fn/mean": 3.8867995738983154, + "rewards/reward_fn/std": 0.46809327602386475, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 71.0625, + "completions/mean_terminated_length": 71.0625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.00669823305231551, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.84375, + "kl": 0.03153826453490183, + "learning_rate": 7.886e-06, + "loss": 0.0013, + "num_tokens": 1444614.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 215.46875, + "completions/mean_terminated_length": 215.46875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.006813719829079571, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.024573901057010517, + "learning_rate": 7.884e-06, + "loss": 0.001, + "num_tokens": 1472181.0, + "reward": 3.8645448684692383, + "reward_std": 0.43451815843582153, + "rewards/reward_fn/mean": 3.8645448684692383, + "rewards/reward_fn/std": 0.43451815843582153, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 212.5625, + "completions/mean_terminated_length": 212.5625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.006929206605843631, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.024366490717511624, + "learning_rate": 7.882e-06, + "loss": 0.001, + "num_tokens": 1491623.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 57.46875, + "completions/mean_terminated_length": 57.46875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.0070446933826076915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.390625, + "kl": 0.04157150277751498, + "learning_rate": 7.879999999999999e-06, + "loss": 0.0017, + "num_tokens": 1514326.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 239.03125, + "completions/mean_terminated_length": 239.03125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.007160180159371752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.020481250830926, + "learning_rate": 7.878e-06, + "loss": 0.0008, + "num_tokens": 1549623.0, + "reward": 2.899916172027588, + "reward_std": 0.3702060580253601, + "rewards/reward_fn/mean": 2.899916172027588, + "rewards/reward_fn/std": 0.3702060878276825, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 410.4375, + "completions/mean_terminated_length": 410.4375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.007275666936135812, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.014740779937710613, + "learning_rate": 7.876e-06, + "loss": 0.0006, + "num_tokens": 1572613.0, + "reward": 3.959090232849121, + "reward_std": 0.23142072558403015, + "rewards/reward_fn/mean": 3.959090232849121, + "rewards/reward_fn/std": 0.23142071068286896, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 94.34375, + "completions/mean_terminated_length": 94.34375, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.007391153712899873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.271484375, + "kl": 0.04839042329695076, + "learning_rate": 7.874e-06, + "loss": 0.0019, + "num_tokens": 1594032.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 75.125, + "completions/mean_terminated_length": 75.125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.007506640489663934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.197265625, + "kl": 0.03093126681051217, + "learning_rate": 7.872e-06, + "loss": 0.0012, + "num_tokens": 1615476.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 68.59375, + "completions/mean_terminated_length": 68.59375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.007622127266427994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.024983717667055316, + "learning_rate": 7.87e-06, + "loss": 0.001, + "num_tokens": 1643687.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 76.46875, + "completions/mean_terminated_length": 76.46875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.007737614043192054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22265625, + "kl": 0.033868179394630715, + "learning_rate": 7.868e-06, + "loss": 0.0014, + "num_tokens": 1669046.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 232.6875, + "completions/mean_terminated_length": 232.6875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.007853100819956115, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.030561698134988546, + "learning_rate": 7.865999999999999e-06, + "loss": 0.0012, + "num_tokens": 1695692.0, + "reward": 3.9035732746124268, + "reward_std": 0.42798852920532227, + "rewards/reward_fn/mean": 3.9035732746124268, + "rewards/reward_fn/std": 0.42798852920532227, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 248.4375, + "completions/mean_terminated_length": 248.4375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.007968587596720176, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.022010388813214377, + "learning_rate": 7.864e-06, + "loss": 0.0009, + "num_tokens": 1720762.0, + "reward": 3.439931631088257, + "reward_std": 0.7190969586372375, + "rewards/reward_fn/mean": 3.439931631088257, + "rewards/reward_fn/std": 0.7190969586372375, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 328.5625, + "completions/mean_terminated_length": 328.5625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.008084074373484237, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.01552356684987899, + "learning_rate": 7.862e-06, + "loss": 0.0006, + "num_tokens": 1742956.0, + "reward": 3.719529628753662, + "reward_std": 0.7539857625961304, + "rewards/reward_fn/mean": 3.719529628753662, + "rewards/reward_fn/std": 0.7539857625961304, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 104.4375, + "completions/mean_terminated_length": 104.4375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.008199561150248296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.197265625, + "kl": 0.03570799369481392, + "learning_rate": 7.86e-06, + "loss": 0.0014, + "num_tokens": 1764762.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 168.9375, + "completions/mean_terminated_length": 168.9375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.008315047927012357, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.020856424584053457, + "learning_rate": 7.858e-06, + "loss": 0.0008, + "num_tokens": 1792472.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1090.0, + "completions/mean_length": 689.6875, + "completions/mean_terminated_length": 645.8709716796875, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.008430534703776417, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.013052131660515442, + "learning_rate": 7.855999999999999e-06, + "loss": 0.0005, + "num_tokens": 1829934.0, + "reward": 2.511488437652588, + "reward_std": 1.223739743232727, + "rewards/reward_fn/mean": 2.511488437652588, + "rewards/reward_fn/std": 1.223739743232727, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 111.03125, + "completions/mean_terminated_length": 111.03125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.008546021480540478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.119140625, + "kl": 0.026283514132956043, + "learning_rate": 7.854e-06, + "loss": 0.0011, + "num_tokens": 1852719.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 151.65625, + "completions/mean_terminated_length": 151.65625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.008661508257304539, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.028143005300080404, + "learning_rate": 7.852e-06, + "loss": 0.0011, + "num_tokens": 1869188.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 92.03125, + "completions/mean_terminated_length": 92.03125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.0087769950340686, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.953125, + "kl": 0.03252672453527339, + "learning_rate": 7.85e-06, + "loss": 0.0013, + "num_tokens": 1889509.0, + "reward": 3.365122079849243, + "reward_std": 0.04244224727153778, + "rewards/reward_fn/mean": 3.365122079849243, + "rewards/reward_fn/std": 0.04244225099682808, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 236.25, + "completions/mean_terminated_length": 236.25, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.00889248181083266, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.019679279357660562, + "learning_rate": 7.848e-06, + "loss": 0.0008, + "num_tokens": 1912973.0, + "reward": 3.7178444862365723, + "reward_std": 0.7585012912750244, + "rewards/reward_fn/mean": 3.7178444862365723, + "rewards/reward_fn/std": 0.7585012912750244, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 206.84375, + "completions/mean_terminated_length": 206.84375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.00900796858759672, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.0211777770309709, + "learning_rate": 7.845999999999999e-06, + "loss": 0.0008, + "num_tokens": 1942280.0, + "reward": 3.9411957263946533, + "reward_std": 0.2461879998445511, + "rewards/reward_fn/mean": 3.9411957263946533, + "rewards/reward_fn/std": 0.2461879849433899, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 104.71875, + "completions/mean_terminated_length": 104.71875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.00912345536436078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.021628671485814266, + "learning_rate": 7.844e-06, + "loss": 0.0009, + "num_tokens": 1968639.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 192.84375, + "completions/mean_terminated_length": 192.84375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.009238942141124841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.016186146152904257, + "learning_rate": 7.841999999999999e-06, + "loss": 0.0006, + "num_tokens": 1987450.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 421.0625, + "completions/mean_terminated_length": 421.0625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.009354428917888902, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.011859486519824713, + "learning_rate": 7.84e-06, + "loss": 0.0005, + "num_tokens": 2016092.0, + "reward": 3.9060776233673096, + "reward_std": 0.4047066271305084, + "rewards/reward_fn/mean": 3.9060776233673096, + "rewards/reward_fn/std": 0.4047066569328308, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 190.78125, + "completions/mean_terminated_length": 190.78125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.009469915694652963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.015096937451744452, + "learning_rate": 7.838e-06, + "loss": 0.0006, + "num_tokens": 2044885.0, + "reward": 3.082202911376953, + "reward_std": 0.3013756573200226, + "rewards/reward_fn/mean": 3.082202911376953, + "rewards/reward_fn/std": 0.3013756573200226, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 214.875, + "completions/mean_terminated_length": 214.875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.009585402471417023, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.024706971511477605, + "learning_rate": 7.836e-06, + "loss": 0.001, + "num_tokens": 2064113.0, + "reward": 3.2103025913238525, + "reward_std": 0.5064564347267151, + "rewards/reward_fn/mean": 3.2103025913238525, + "rewards/reward_fn/std": 0.5064564943313599, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 96.71875, + "completions/mean_terminated_length": 96.71875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.009700889248181084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.03117097503854893, + "learning_rate": 7.834e-06, + "loss": 0.0012, + "num_tokens": 2082984.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 120.78125, + "completions/mean_terminated_length": 120.78125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.009816376024945143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1240234375, + "kl": 0.021499120732187293, + "learning_rate": 7.831999999999999e-06, + "loss": 0.0009, + "num_tokens": 2098657.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 316.6875, + "completions/mean_terminated_length": 260.8387145996094, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.009931862801709204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.01617911219364032, + "learning_rate": 7.83e-06, + "loss": 0.0006, + "num_tokens": 2130775.0, + "reward": 2.9692015647888184, + "reward_std": 0.4408594071865082, + "rewards/reward_fn/mean": 2.9692015647888184, + "rewards/reward_fn/std": 0.4408594071865082, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 88.25, + "completions/mean_terminated_length": 88.25, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.010047349578473265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.009794390622118954, + "learning_rate": 7.828e-06, + "loss": 0.0004, + "num_tokens": 2150399.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 335.625, + "completions/mean_terminated_length": 335.625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.010162836355237325, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.013089064421365038, + "learning_rate": 7.826e-06, + "loss": 0.0005, + "num_tokens": 2185171.0, + "reward": 3.1257212162017822, + "reward_std": 0.6987590789794922, + "rewards/reward_fn/mean": 3.1257212162017822, + "rewards/reward_fn/std": 0.6987590193748474, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 112.03125, + "completions/mean_terminated_length": 112.03125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.010278323132001386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.01971272932132706, + "learning_rate": 7.824e-06, + "loss": 0.0008, + "num_tokens": 2200628.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 507.9375, + "completions/mean_terminated_length": 507.9375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.010393809908765447, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.01151612785179168, + "learning_rate": 7.821999999999999e-06, + "loss": 0.0005, + "num_tokens": 2226642.0, + "reward": 3.786811351776123, + "reward_std": 0.6734852194786072, + "rewards/reward_fn/mean": 3.786811351776123, + "rewards/reward_fn/std": 0.6734851598739624, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 92.84375, + "completions/mean_terminated_length": 92.84375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.010509296685529508, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.016836186332511716, + "learning_rate": 7.82e-06, + "loss": 0.0007, + "num_tokens": 2252877.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1039.0, + "completions/max_terminated_length": 1039.0, + "completions/mean_length": 453.1875, + "completions/mean_terminated_length": 453.1875, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.010624783462293567, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.01494447427103296, + "learning_rate": 7.817999999999999e-06, + "loss": 0.0006, + "num_tokens": 2290451.0, + "reward": 2.8501486778259277, + "reward_std": 0.78495192527771, + "rewards/reward_fn/mean": 2.8501486778259277, + "rewards/reward_fn/std": 0.78495192527771, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 99.0625, + "completions/mean_terminated_length": 99.0625, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.010740270239057628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.020569760163198225, + "learning_rate": 7.816e-06, + "loss": 0.0008, + "num_tokens": 2312725.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 340.5, + "completions/mean_terminated_length": 340.5, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.010855757015821688, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.015206792741082609, + "learning_rate": 7.814e-06, + "loss": 0.0006, + "num_tokens": 2338597.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 322.96875, + "completions/mean_terminated_length": 267.32257080078125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.010971243792585749, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.97265625, + "kl": 0.014793651444051648, + "learning_rate": 7.812e-06, + "loss": 0.0006, + "num_tokens": 2372612.0, + "reward": 3.727499008178711, + "reward_std": 0.8200544118881226, + "rewards/reward_fn/mean": 3.727499008178711, + "rewards/reward_fn/std": 0.8200544118881226, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 93.15625, + "completions/mean_terminated_length": 93.15625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.01108673056934981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1796875, + "kl": 0.024640864270622842, + "learning_rate": 7.81e-06, + "loss": 0.001, + "num_tokens": 2393673.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 524.8125, + "completions/mean_terminated_length": 524.8125, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.01120221734611387, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.010376595790148713, + "learning_rate": 7.807999999999999e-06, + "loss": 0.0004, + "num_tokens": 2422531.0, + "reward": 3.345029354095459, + "reward_std": 1.0639269351959229, + "rewards/reward_fn/mean": 3.345029354095459, + "rewards/reward_fn/std": 1.0639269351959229, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 115.375, + "completions/mean_terminated_length": 115.375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.01131770412287793, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.019667836422740947, + "learning_rate": 7.806e-06, + "loss": 0.0008, + "num_tokens": 2445167.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 64.75, + "completions/mean_terminated_length": 64.75, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.01143319089964199, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.015384327300125733, + "learning_rate": 7.804e-06, + "loss": 0.0006, + "num_tokens": 2469191.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1477.0, + "completions/mean_length": 464.84375, + "completions/mean_terminated_length": 413.774169921875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.011548677676406051, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.85546875, + "kl": 0.010455664574692491, + "learning_rate": 7.802e-06, + "loss": 0.0004, + "num_tokens": 2507618.0, + "reward": 2.8724632263183594, + "reward_std": 0.7800498604774475, + "rewards/reward_fn/mean": 2.8724632263183594, + "rewards/reward_fn/std": 0.7800498604774475, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 248.34375, + "completions/mean_terminated_length": 248.34375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.011664164453170112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05126953125, + "kl": 0.012028088873194065, + "learning_rate": 7.8e-06, + "loss": 0.0005, + "num_tokens": 2531053.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 354.875, + "completions/mean_terminated_length": 354.875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.011779651229934173, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039306640625, + "kl": 0.008688887042808346, + "learning_rate": 7.797999999999999e-06, + "loss": 0.0003, + "num_tokens": 2557929.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 223.65625, + "completions/mean_terminated_length": 223.65625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.011895138006698234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.016532731475308537, + "learning_rate": 7.796e-06, + "loss": 0.0007, + "num_tokens": 2591806.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 216.03125, + "completions/mean_terminated_length": 216.03125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.012010624783462294, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.015770250523928553, + "learning_rate": 7.793999999999999e-06, + "loss": 0.0006, + "num_tokens": 2611391.0, + "reward": 3.972655773162842, + "reward_std": 0.1546824425458908, + "rewards/reward_fn/mean": 3.972655773162842, + "rewards/reward_fn/std": 0.15468242764472961, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 97.40625, + "completions/mean_terminated_length": 97.40625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.012126111560226353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1591796875, + "kl": 0.0254394389630761, + "learning_rate": 7.792e-06, + "loss": 0.001, + "num_tokens": 2637676.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 86.9375, + "completions/mean_terminated_length": 86.9375, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.012241598336990414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.01503068449528655, + "learning_rate": 7.79e-06, + "loss": 0.0006, + "num_tokens": 2658090.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 62.15625, + "completions/mean_terminated_length": 62.15625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.012357085113754475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.017051633825758472, + "learning_rate": 7.788e-06, + "loss": 0.0007, + "num_tokens": 2675791.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 108.34375, + "completions/mean_terminated_length": 108.34375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.012472571890518536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.018019895971519873, + "learning_rate": 7.786e-06, + "loss": 0.0007, + "num_tokens": 2697434.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 136.8125, + "completions/mean_terminated_length": 136.8125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.012588058667282596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.018596080102724954, + "learning_rate": 7.783999999999999e-06, + "loss": 0.0007, + "num_tokens": 2723860.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 244.65625, + "completions/mean_terminated_length": 244.65625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.012703545444046657, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.014697418868308887, + "learning_rate": 7.782e-06, + "loss": 0.0006, + "num_tokens": 2755241.0, + "reward": 3.6643576622009277, + "reward_std": 0.6930525898933411, + "rewards/reward_fn/mean": 3.6643576622009277, + "rewards/reward_fn/std": 0.6930525898933411, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 170.46875, + "completions/mean_terminated_length": 170.46875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.012819032220810718, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.015238240972394124, + "learning_rate": 7.78e-06, + "loss": 0.0006, + "num_tokens": 2774584.0, + "reward": 3.93694806098938, + "reward_std": 0.2528596818447113, + "rewards/reward_fn/mean": 3.93694806098938, + "rewards/reward_fn/std": 0.2528596520423889, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 278.84375, + "completions/mean_terminated_length": 278.84375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.012934518997574777, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.013981461539515294, + "learning_rate": 7.777999999999999e-06, + "loss": 0.0006, + "num_tokens": 2810163.0, + "reward": 3.0274157524108887, + "reward_std": 0.501778244972229, + "rewards/reward_fn/mean": 3.0274157524108887, + "rewards/reward_fn/std": 0.501778244972229, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 62.40625, + "completions/mean_terminated_length": 62.40625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.013050005774338838, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.78125, + "kl": 0.010410311500891112, + "learning_rate": 7.776e-06, + "loss": 0.0004, + "num_tokens": 2835712.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 134.8125, + "completions/mean_terminated_length": 134.8125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.013165492551102899, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.02155936587951146, + "learning_rate": 7.774e-06, + "loss": 0.0009, + "num_tokens": 2852698.0, + "reward": 3.9293909072875977, + "reward_std": 0.39942461252212524, + "rewards/reward_fn/mean": 3.9293909072875977, + "rewards/reward_fn/std": 0.39942455291748047, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 135.8125, + "completions/mean_terminated_length": 135.8125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.01328097932786696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.014890697624650784, + "learning_rate": 7.772e-06, + "loss": 0.0006, + "num_tokens": 2881204.0, + "reward": 3.968928337097168, + "reward_std": 0.17576861381530762, + "rewards/reward_fn/mean": 3.968928337097168, + "rewards/reward_fn/std": 0.1757686287164688, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 169.4375, + "completions/mean_terminated_length": 169.4375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.01339646610463102, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.010506000289751682, + "learning_rate": 7.769999999999998e-06, + "loss": 0.0004, + "num_tokens": 2895362.0, + "reward": 2.929551601409912, + "reward_std": 0.06777659803628922, + "rewards/reward_fn/mean": 2.929551601409912, + "rewards/reward_fn/std": 0.06777660548686981, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 302.09375, + "completions/mean_terminated_length": 302.09375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.013511952881395081, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.00858928075467702, + "learning_rate": 7.767999999999999e-06, + "loss": 0.0003, + "num_tokens": 2920069.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 292.1875, + "completions/mean_terminated_length": 292.1875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.013627439658159142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.00872733339201659, + "learning_rate": 7.766e-06, + "loss": 0.0003, + "num_tokens": 2945579.0, + "reward": 3.85664701461792, + "reward_std": 0.5641152262687683, + "rewards/reward_fn/mean": 3.85664701461792, + "rewards/reward_fn/std": 0.5641151666641235, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 104.53125, + "completions/mean_terminated_length": 104.53125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.0137429264349232, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.018312103711650707, + "learning_rate": 7.764e-06, + "loss": 0.0007, + "num_tokens": 2971260.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 202.34375, + "completions/mean_terminated_length": 202.34375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.013858413211687261, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05859375, + "kl": 0.010406906054413412, + "learning_rate": 7.762e-06, + "loss": 0.0004, + "num_tokens": 2992423.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 146.84375, + "completions/mean_terminated_length": 146.84375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.013973899988451322, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.017725229496136308, + "learning_rate": 7.76e-06, + "loss": 0.0007, + "num_tokens": 3009282.0, + "reward": 3.9311132431030273, + "reward_std": 0.3896828591823578, + "rewards/reward_fn/mean": 3.9311132431030273, + "rewards/reward_fn/std": 0.3896828591823578, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 216.71875, + "completions/mean_terminated_length": 216.71875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.014089386765215383, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.015904723972198553, + "learning_rate": 7.758e-06, + "loss": 0.0006, + "num_tokens": 3026681.0, + "reward": 3.9332802295684814, + "reward_std": 0.3774241805076599, + "rewards/reward_fn/mean": 3.9332802295684814, + "rewards/reward_fn/std": 0.37742412090301514, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 210.96875, + "completions/mean_terminated_length": 210.96875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.014204873541979444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.016583141055889428, + "learning_rate": 7.756e-06, + "loss": 0.0007, + "num_tokens": 3052888.0, + "reward": 3.7604129314422607, + "reward_std": 0.39222797751426697, + "rewards/reward_fn/mean": 3.7604129314422607, + "rewards/reward_fn/std": 0.39222797751426697, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 205.0, + "completions/mean_terminated_length": 205.0, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.014320360318743505, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.014824314479483292, + "learning_rate": 7.753999999999999e-06, + "loss": 0.0006, + "num_tokens": 3084824.0, + "reward": 3.58933687210083, + "reward_std": 0.44506555795669556, + "rewards/reward_fn/mean": 3.58933687210083, + "rewards/reward_fn/std": 0.44506552815437317, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 121.34375, + "completions/mean_terminated_length": 121.34375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.014435847095507564, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435546875, + "kl": 0.019597054124460556, + "learning_rate": 7.752e-06, + "loss": 0.0008, + "num_tokens": 3109283.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 220.90625, + "completions/mean_terminated_length": 220.90625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.014551333872271624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.013208292002673261, + "learning_rate": 7.75e-06, + "loss": 0.0005, + "num_tokens": 3124480.0, + "reward": 3.8604013919830322, + "reward_std": 0.5493434071540833, + "rewards/reward_fn/mean": 3.8604013919830322, + "rewards/reward_fn/std": 0.5493433475494385, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 221.59375, + "completions/mean_terminated_length": 221.59375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.014666820649035685, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.014523693433147855, + "learning_rate": 7.748e-06, + "loss": 0.0006, + "num_tokens": 3142643.0, + "reward": 3.928016424179077, + "reward_std": 0.40720096230506897, + "rewards/reward_fn/mean": 3.928016424179077, + "rewards/reward_fn/std": 0.40720096230506897, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 95.375, + "completions/mean_terminated_length": 95.375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.014782307425799746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.019759700269787572, + "learning_rate": 7.746e-06, + "loss": 0.0008, + "num_tokens": 3158239.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 143.21875, + "completions/mean_terminated_length": 143.21875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.014897794202563807, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.020622563402866945, + "learning_rate": 7.743999999999999e-06, + "loss": 0.0008, + "num_tokens": 3180294.0, + "reward": 3.862762451171875, + "reward_std": 0.5400276184082031, + "rewards/reward_fn/mean": 3.862762451171875, + "rewards/reward_fn/std": 0.5400275588035583, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 173.1875, + "completions/mean_terminated_length": 173.1875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.015013280979327867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.014052953687496483, + "learning_rate": 7.742e-06, + "loss": 0.0006, + "num_tokens": 3208908.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 149.53125, + "completions/mean_terminated_length": 149.53125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.015128767756091928, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.01584467747306917, + "learning_rate": 7.74e-06, + "loss": 0.0006, + "num_tokens": 3235005.0, + "reward": 3.9316000938415527, + "reward_std": 0.38692882657051086, + "rewards/reward_fn/mean": 3.9316000938415527, + "rewards/reward_fn/std": 0.3869287967681885, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 194.40625, + "completions/mean_terminated_length": 194.40625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.015244254532855987, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.014747205743333325, + "learning_rate": 7.738e-06, + "loss": 0.0006, + "num_tokens": 3253930.0, + "reward": 3.9300718307495117, + "reward_std": 0.39557281136512756, + "rewards/reward_fn/mean": 3.9300718307495117, + "rewards/reward_fn/std": 0.3955727815628052, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 201.53125, + "completions/mean_terminated_length": 201.53125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.015359741309620048, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.01172824545938056, + "learning_rate": 7.736e-06, + "loss": 0.0005, + "num_tokens": 3282747.0, + "reward": 3.133096694946289, + "reward_std": 0.06656574457883835, + "rewards/reward_fn/mean": 3.133096694946289, + "rewards/reward_fn/std": 0.06656574457883835, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 290.4375, + "completions/mean_terminated_length": 290.4375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.015475228086384109, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.012955484591657296, + "learning_rate": 7.733999999999999e-06, + "loss": 0.0005, + "num_tokens": 3306921.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 97.6875, + "completions/mean_terminated_length": 97.6875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.01559071486314817, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1650390625, + "kl": 0.019436730508459732, + "learning_rate": 7.732e-06, + "loss": 0.0008, + "num_tokens": 3329599.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 109.125, + "completions/mean_terminated_length": 109.125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.01570620163991223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.01739132360671647, + "learning_rate": 7.73e-06, + "loss": 0.0007, + "num_tokens": 3351523.0, + "reward": 3.980455160140991, + "reward_std": 0.11056230217218399, + "rewards/reward_fn/mean": 3.980455160140991, + "rewards/reward_fn/std": 0.11056230962276459, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 152.28125, + "completions/mean_terminated_length": 152.28125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.01582168841667629, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.025648201757576317, + "learning_rate": 7.728e-06, + "loss": 0.001, + "num_tokens": 3374412.0, + "reward": 3.2180051803588867, + "reward_std": 0.17951545119285583, + "rewards/reward_fn/mean": 3.2180051803588867, + "rewards/reward_fn/std": 0.17951543629169464, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 158.40625, + "completions/mean_terminated_length": 158.40625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.015937175193440352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.01602337954682298, + "learning_rate": 7.726e-06, + "loss": 0.0006, + "num_tokens": 3402681.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 216.0625, + "completions/mean_terminated_length": 216.0625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.016052661970204413, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.016065287258243188, + "learning_rate": 7.724e-06, + "loss": 0.0006, + "num_tokens": 3428731.0, + "reward": 3.36491060256958, + "reward_std": 0.4419271945953369, + "rewards/reward_fn/mean": 3.36491060256958, + "rewards/reward_fn/std": 0.4419271945953369, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 170.59375, + "completions/mean_terminated_length": 170.59375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.016168148746968473, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.025451386653003283, + "learning_rate": 7.722e-06, + "loss": 0.001, + "num_tokens": 3454862.0, + "reward": 3.790025234222412, + "reward_std": 0.4711700677871704, + "rewards/reward_fn/mean": 3.790025234222412, + "rewards/reward_fn/std": 0.4711700975894928, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 268.59375, + "completions/mean_terminated_length": 268.59375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.016283635523732534, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.015362554244347848, + "learning_rate": 7.719999999999999e-06, + "loss": 0.0006, + "num_tokens": 3478529.0, + "reward": 3.9297032356262207, + "reward_std": 0.39765939116477966, + "rewards/reward_fn/mean": 3.9297032356262207, + "rewards/reward_fn/std": 0.39765939116477966, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 287.3125, + "completions/mean_terminated_length": 230.51612854003906, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.01639912230049659, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.019993634006823413, + "learning_rate": 7.718e-06, + "loss": 0.0008, + "num_tokens": 3509099.0, + "reward": 2.751225471496582, + "reward_std": 0.9610175490379333, + "rewards/reward_fn/mean": 2.751225471496582, + "rewards/reward_fn/std": 0.9610175490379333, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 220.8125, + "completions/mean_terminated_length": 220.8125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.016514609077260652, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.01275358148268424, + "learning_rate": 7.716e-06, + "loss": 0.0005, + "num_tokens": 3542725.0, + "reward": 3.0948753356933594, + "reward_std": 1.1274739503860474, + "rewards/reward_fn/mean": 3.0948753356933594, + "rewards/reward_fn/std": 1.1274738311767578, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 70.6875, + "completions/mean_terminated_length": 70.6875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.016630095854024713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.017413303678040393, + "learning_rate": 7.714e-06, + "loss": 0.0007, + "num_tokens": 3559099.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 155.03125, + "completions/mean_terminated_length": 155.03125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.016745582630788774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.020352868450572714, + "learning_rate": 7.712e-06, + "loss": 0.0008, + "num_tokens": 3587676.0, + "reward": 3.310816526412964, + "reward_std": 0.08747676014900208, + "rewards/reward_fn/mean": 3.310816526412964, + "rewards/reward_fn/std": 0.08747676014900208, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 235.78125, + "completions/mean_terminated_length": 235.78125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.016861069407552835, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.012610627294634469, + "learning_rate": 7.709999999999999e-06, + "loss": 0.0005, + "num_tokens": 3621525.0, + "reward": 3.7740535736083984, + "reward_std": 0.601483941078186, + "rewards/reward_fn/mean": 3.7740535736083984, + "rewards/reward_fn/std": 0.601483941078186, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 92.125, + "completions/mean_terminated_length": 92.125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.016976556184316895, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.020985419374483172, + "learning_rate": 7.708e-06, + "loss": 0.0008, + "num_tokens": 3632537.0, + "reward": 3.4343881607055664, + "reward_std": 0.09288819134235382, + "rewards/reward_fn/mean": 3.4343881607055664, + "rewards/reward_fn/std": 0.09288818389177322, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 72.4375, + "completions/mean_terminated_length": 72.4375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.017092042961080956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17578125, + "kl": 0.013261507643619552, + "learning_rate": 7.706e-06, + "loss": 0.0005, + "num_tokens": 3649799.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 86.21875, + "completions/mean_terminated_length": 86.21875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.017207529737845017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.014660751301562414, + "learning_rate": 7.704e-06, + "loss": 0.0006, + "num_tokens": 3667310.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 98.28125, + "completions/mean_terminated_length": 98.28125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.017323016514609078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1484375, + "kl": 0.0183602440811228, + "learning_rate": 7.702e-06, + "loss": 0.0007, + "num_tokens": 3684855.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 129.96875, + "completions/mean_terminated_length": 129.96875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.01743850329137314, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.02099324819573667, + "learning_rate": 7.699999999999999e-06, + "loss": 0.0008, + "num_tokens": 3698614.0, + "reward": 3.089387893676758, + "reward_std": 0.24362511932849884, + "rewards/reward_fn/mean": 3.089387893676758, + "rewards/reward_fn/std": 0.24362513422966003, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 55.34375, + "completions/mean_terminated_length": 55.34375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.0175539900681372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.224609375, + "kl": 0.02759716412401758, + "learning_rate": 7.698e-06, + "loss": 0.0011, + "num_tokens": 3724033.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 280.90625, + "completions/mean_terminated_length": 280.90625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.01766947684490126, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.017909545684233308, + "learning_rate": 7.695999999999999e-06, + "loss": 0.0007, + "num_tokens": 3751390.0, + "reward": 3.9157190322875977, + "reward_std": 0.27629175782203674, + "rewards/reward_fn/mean": 3.9157190322875977, + "rewards/reward_fn/std": 0.27629172801971436, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 317.65625, + "completions/mean_terminated_length": 317.65625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.01778496362166532, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.014282223433838226, + "learning_rate": 7.694e-06, + "loss": 0.0006, + "num_tokens": 3783699.0, + "reward": 3.4250197410583496, + "reward_std": 0.6094458103179932, + "rewards/reward_fn/mean": 3.4250197410583496, + "rewards/reward_fn/std": 0.6094458103179932, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 300.84375, + "completions/mean_terminated_length": 300.84375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.01790045039842938, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.013379654570599087, + "learning_rate": 7.692e-06, + "loss": 0.0005, + "num_tokens": 3806382.0, + "reward": 2.7548890113830566, + "reward_std": 0.04573509842157364, + "rewards/reward_fn/mean": 2.7548890113830566, + "rewards/reward_fn/std": 0.04573511332273483, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 108.21875, + "completions/mean_terminated_length": 108.21875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.01801593717519344, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.029635253100423142, + "learning_rate": 7.69e-06, + "loss": 0.0012, + "num_tokens": 3830965.0, + "reward": 3.3282418251037598, + "reward_std": 0.0620730035007, + "rewards/reward_fn/mean": 3.3282418251037598, + "rewards/reward_fn/std": 0.06207301840186119, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 68.125, + "completions/mean_terminated_length": 68.125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.0181314239519575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.171875, + "kl": 0.01991251268191263, + "learning_rate": 7.688e-06, + "loss": 0.0008, + "num_tokens": 3848953.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 181.84375, + "completions/mean_terminated_length": 181.84375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.01824691072872156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.017111980705522, + "learning_rate": 7.685999999999999e-06, + "loss": 0.0007, + "num_tokens": 3862804.0, + "reward": 3.8633131980895996, + "reward_std": 0.4741106927394867, + "rewards/reward_fn/mean": 3.8633131980895996, + "rewards/reward_fn/std": 0.4741107225418091, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 126.71875, + "completions/mean_terminated_length": 126.71875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.01836239750548562, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.024297067458974198, + "learning_rate": 7.684e-06, + "loss": 0.001, + "num_tokens": 3883051.0, + "reward": 3.9712982177734375, + "reward_std": 0.16236194968223572, + "rewards/reward_fn/mean": 3.9712982177734375, + "rewards/reward_fn/std": 0.1623619645833969, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 76.28125, + "completions/mean_terminated_length": 76.28125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.018477884282249682, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.020389388024341315, + "learning_rate": 7.682e-06, + "loss": 0.0008, + "num_tokens": 3903444.0, + "reward": 3.9637649059295654, + "reward_std": 0.20497699081897736, + "rewards/reward_fn/mean": 3.9637649059295654, + "rewards/reward_fn/std": 0.20497702062129974, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 292.0625, + "completions/mean_terminated_length": 292.0625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.018593371059013743, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.01728770465706475, + "learning_rate": 7.68e-06, + "loss": 0.0007, + "num_tokens": 3929814.0, + "reward": 3.55471134185791, + "reward_std": 0.7556344270706177, + "rewards/reward_fn/mean": 3.55471134185791, + "rewards/reward_fn/std": 0.7556343674659729, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 264.15625, + "completions/mean_terminated_length": 264.15625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.018708857835777803, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.02223204346955754, + "learning_rate": 7.678e-06, + "loss": 0.0009, + "num_tokens": 3955323.0, + "reward": 3.669635534286499, + "reward_std": 0.5098779797554016, + "rewards/reward_fn/mean": 3.669635534286499, + "rewards/reward_fn/std": 0.5098779797554016, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 244.0, + "completions/mean_terminated_length": 244.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.018824344612541864, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.019092816684860736, + "learning_rate": 7.675999999999999e-06, + "loss": 0.0008, + "num_tokens": 3990875.0, + "reward": 3.2623233795166016, + "reward_std": 0.4456377625465393, + "rewards/reward_fn/mean": 3.2623233795166016, + "rewards/reward_fn/std": 0.4456377625465393, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.018939831389305925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.018806629959726706, + "learning_rate": 7.674e-06, + "loss": 0.0008, + "num_tokens": 4011025.0, + "reward": 3.719357490539551, + "reward_std": 0.7545589208602905, + "rewards/reward_fn/mean": 3.719357490539551, + "rewards/reward_fn/std": 0.7545588612556458, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 303.34375, + "completions/mean_terminated_length": 303.34375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.019055318166069986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.01940413028933108, + "learning_rate": 7.671999999999999e-06, + "loss": 0.0008, + "num_tokens": 4033308.0, + "reward": 3.9284942150115967, + "reward_std": 0.404497891664505, + "rewards/reward_fn/mean": 3.9284942150115967, + "rewards/reward_fn/std": 0.404497891664505, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 163.625, + "completions/mean_terminated_length": 163.625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.019170804942834047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.031739470607135445, + "learning_rate": 7.67e-06, + "loss": 0.0013, + "num_tokens": 4050768.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.019286291719598107, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.017154234199551865, + "learning_rate": 7.668e-06, + "loss": 0.0007, + "num_tokens": 4075103.0, + "reward": 3.930649757385254, + "reward_std": 0.39230385422706604, + "rewards/reward_fn/mean": 3.930649757385254, + "rewards/reward_fn/std": 0.39230382442474365, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 391.25, + "completions/mean_terminated_length": 391.25, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.019401778496362168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.01577149744844064, + "learning_rate": 7.666e-06, + "loss": 0.0006, + "num_tokens": 4112071.0, + "reward": 3.180412530899048, + "reward_std": 0.7320826649665833, + "rewards/reward_fn/mean": 3.180412530899048, + "rewards/reward_fn/std": 0.7320825457572937, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 330.8125, + "completions/mean_terminated_length": 330.8125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.019517265273126225, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.019775311928242445, + "learning_rate": 7.664e-06, + "loss": 0.0008, + "num_tokens": 4137953.0, + "reward": 3.786625385284424, + "reward_std": 0.6740304231643677, + "rewards/reward_fn/mean": 3.786625385284424, + "rewards/reward_fn/std": 0.6740304827690125, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 153.78125, + "completions/mean_terminated_length": 153.78125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.019632752049890286, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.03573286079335958, + "learning_rate": 7.661999999999999e-06, + "loss": 0.0014, + "num_tokens": 4164698.0, + "reward": 3.0641984939575195, + "reward_std": 0.09806602448225021, + "rewards/reward_fn/mean": 3.0641984939575195, + "rewards/reward_fn/std": 0.09806601703166962, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 160.25, + "completions/mean_terminated_length": 160.25, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.019748238826654347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.022935522778425366, + "learning_rate": 7.66e-06, + "loss": 0.0009, + "num_tokens": 4184834.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 172.65625, + "completions/mean_terminated_length": 172.65625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.019863725603418408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1611328125, + "kl": 0.024963288567960262, + "learning_rate": 7.658e-06, + "loss": 0.001, + "num_tokens": 4211383.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 84.53125, + "completions/mean_terminated_length": 84.53125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.01997921238018247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1337890625, + "kl": 0.02528808820352424, + "learning_rate": 7.656e-06, + "loss": 0.001, + "num_tokens": 4223464.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 190.71875, + "completions/mean_terminated_length": 190.71875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.02009469915694653, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.020916100693284534, + "learning_rate": 7.654e-06, + "loss": 0.0008, + "num_tokens": 4244575.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 103.84375, + "completions/mean_terminated_length": 103.84375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.02021018593371059, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.03350284043699503, + "learning_rate": 7.652e-06, + "loss": 0.0013, + "num_tokens": 4261466.0, + "reward": 3.971558094024658, + "reward_std": 0.16089169681072235, + "rewards/reward_fn/mean": 3.971558094024658, + "rewards/reward_fn/std": 0.16089168190956116, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 119.75, + "completions/mean_terminated_length": 119.75, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.02032567271047465, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.03560122271301225, + "learning_rate": 7.65e-06, + "loss": 0.0014, + "num_tokens": 4278034.0, + "reward": 3.9096474647521973, + "reward_std": 0.2883630692958832, + "rewards/reward_fn/mean": 3.9096474647521973, + "rewards/reward_fn/std": 0.2883630394935608, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 115.65625, + "completions/mean_terminated_length": 115.65625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.02044115948723871, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1865234375, + "kl": 0.03134759716340341, + "learning_rate": 7.647999999999999e-06, + "loss": 0.0013, + "num_tokens": 4304231.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 349.125, + "completions/mean_terminated_length": 349.125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.020556646264002772, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.020236187061527744, + "learning_rate": 7.646e-06, + "loss": 0.0008, + "num_tokens": 4336619.0, + "reward": 2.9808521270751953, + "reward_std": 0.2878401279449463, + "rewards/reward_fn/mean": 2.9808521270751953, + "rewards/reward_fn/std": 0.2878401577472687, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 228.15625, + "completions/mean_terminated_length": 228.15625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.020672133040766833, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.020011651547974907, + "learning_rate": 7.644e-06, + "loss": 0.0008, + "num_tokens": 4367120.0, + "reward": 3.3542399406433105, + "reward_std": 0.4304291009902954, + "rewards/reward_fn/mean": 3.3542399406433105, + "rewards/reward_fn/std": 0.430429071187973, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 128.0625, + "completions/mean_terminated_length": 128.0625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.020787619817530894, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.025502130782115273, + "learning_rate": 7.642e-06, + "loss": 0.001, + "num_tokens": 4384082.0, + "reward": 3.562398910522461, + "reward_std": 0.40100839734077454, + "rewards/reward_fn/mean": 3.562398910522461, + "rewards/reward_fn/std": 0.4010084271430969, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 104.875, + "completions/mean_terminated_length": 104.875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.020903106594294955, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.01644925591244828, + "learning_rate": 7.64e-06, + "loss": 0.0007, + "num_tokens": 4415758.0, + "reward": 3.753542900085449, + "reward_std": 0.4343796968460083, + "rewards/reward_fn/mean": 3.753542900085449, + "rewards/reward_fn/std": 0.4343796670436859, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 208.625, + "completions/mean_terminated_length": 208.625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.021018593371059015, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.018285699290572666, + "learning_rate": 7.638e-06, + "loss": 0.0007, + "num_tokens": 4441794.0, + "reward": 3.8973772525787354, + "reward_std": 0.43246039748191833, + "rewards/reward_fn/mean": 3.8973772525787354, + "rewards/reward_fn/std": 0.43246039748191833, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 83.9375, + "completions/mean_terminated_length": 83.9375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.021134080147823073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.014482722210232168, + "learning_rate": 7.636e-06, + "loss": 0.0006, + "num_tokens": 4459936.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 84.0, + "completions/mean_length": 55.65625, + "completions/mean_terminated_length": 55.65625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.021249566924587134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.010199909316725098, + "learning_rate": 7.634e-06, + "loss": 0.0004, + "num_tokens": 4491093.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 264.65625, + "completions/mean_terminated_length": 264.65625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.021365053701351194, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.012682401356869377, + "learning_rate": 7.631999999999999e-06, + "loss": 0.0005, + "num_tokens": 4519178.0, + "reward": 3.8302412033081055, + "reward_std": 0.5768439173698425, + "rewards/reward_fn/mean": 3.8302412033081055, + "rewards/reward_fn/std": 0.5768439769744873, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 317.0625, + "completions/mean_terminated_length": 317.0625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.021480540478115255, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.017839080974226817, + "learning_rate": 7.63e-06, + "loss": 0.0007, + "num_tokens": 4542156.0, + "reward": 3.8544247150421143, + "reward_std": 0.572951078414917, + "rewards/reward_fn/mean": 3.8544247150421143, + "rewards/reward_fn/std": 0.572951078414917, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 78.59375, + "completions/mean_terminated_length": 78.59375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.021596027254879316, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.984375, + "kl": 0.028004055639030412, + "learning_rate": 7.628e-06, + "loss": 0.0011, + "num_tokens": 4563103.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 317.0, + "completions/mean_terminated_length": 317.0, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.021711514031643377, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.015271464217221364, + "learning_rate": 7.626e-06, + "loss": 0.0006, + "num_tokens": 4585215.0, + "reward": 2.9311952590942383, + "reward_std": 1.1564152240753174, + "rewards/reward_fn/mean": 2.9311952590942383, + "rewards/reward_fn/std": 1.1564151048660278, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 329.875, + "completions/mean_terminated_length": 329.875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.021827000808407437, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.015978061652276665, + "learning_rate": 7.623999999999999e-06, + "loss": 0.0006, + "num_tokens": 4608091.0, + "reward": 3.861049175262451, + "reward_std": 0.5468943119049072, + "rewards/reward_fn/mean": 3.861049175262451, + "rewards/reward_fn/std": 0.5468943119049072, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 162.5, + "completions/mean_terminated_length": 162.5, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.021942487585171498, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.02364223130280152, + "learning_rate": 7.621999999999999e-06, + "loss": 0.0009, + "num_tokens": 4624747.0, + "reward": 3.927245855331421, + "reward_std": 0.41155943274497986, + "rewards/reward_fn/mean": 3.927245855331421, + "rewards/reward_fn/std": 0.4115593731403351, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 132.5625, + "completions/mean_terminated_length": 132.5625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.02205797436193556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1806640625, + "kl": 0.0313702875864692, + "learning_rate": 7.62e-06, + "loss": 0.0013, + "num_tokens": 4645181.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 316.53125, + "completions/mean_terminated_length": 316.53125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.02217346113869962, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.014644406852312386, + "learning_rate": 7.618e-06, + "loss": 0.0006, + "num_tokens": 4671342.0, + "reward": 3.5007386207580566, + "reward_std": 0.9586796164512634, + "rewards/reward_fn/mean": 3.5007386207580566, + "rewards/reward_fn/std": 0.9586796164512634, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 125.9375, + "completions/mean_terminated_length": 125.9375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.02228894791546368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.019510644633555785, + "learning_rate": 7.616e-06, + "loss": 0.0008, + "num_tokens": 4688780.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 170.09375, + "completions/mean_terminated_length": 170.09375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.02240443469222774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.015486361302464502, + "learning_rate": 7.6139999999999994e-06, + "loss": 0.0006, + "num_tokens": 4706863.0, + "reward": 1.840867519378662, + "reward_std": 0.485943466424942, + "rewards/reward_fn/mean": 1.840867519378662, + "rewards/reward_fn/std": 0.485943466424942, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 392.4375, + "completions/mean_terminated_length": 392.4375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.022519921468991802, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.012444564432371408, + "learning_rate": 7.612e-06, + "loss": 0.0005, + "num_tokens": 4741757.0, + "reward": 3.736908197402954, + "reward_std": 0.5126994252204895, + "rewards/reward_fn/mean": 3.736908197402954, + "rewards/reward_fn/std": 0.5126993656158447, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 205.71875, + "completions/mean_terminated_length": 205.71875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.02263540824575586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.020089497498702258, + "learning_rate": 7.61e-06, + "loss": 0.0008, + "num_tokens": 4766484.0, + "reward": 3.428215980529785, + "reward_std": 0.8355299234390259, + "rewards/reward_fn/mean": 3.428215980529785, + "rewards/reward_fn/std": 0.8355298638343811, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 190.65625, + "completions/mean_terminated_length": 190.65625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.02275089502251992, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.022923822980374098, + "learning_rate": 7.607999999999999e-06, + "loss": 0.0009, + "num_tokens": 4781513.0, + "reward": 3.8592920303344727, + "reward_std": 0.5536801815032959, + "rewards/reward_fn/mean": 3.8592920303344727, + "rewards/reward_fn/std": 0.5536801815032959, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 329.5, + "completions/mean_terminated_length": 329.5, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.02286638179928398, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.021412867528852075, + "learning_rate": 7.606e-06, + "loss": 0.0009, + "num_tokens": 4816217.0, + "reward": 3.214181423187256, + "reward_std": 0.5220996737480164, + "rewards/reward_fn/mean": 3.214181423187256, + "rewards/reward_fn/std": 0.5220996737480164, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 54.65625, + "completions/mean_terminated_length": 54.65625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.02298186857604804, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.609375, + "kl": 0.015518083790084347, + "learning_rate": 7.6039999999999995e-06, + "loss": 0.0006, + "num_tokens": 4837038.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 178.5, + "completions/mean_terminated_length": 178.5, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.023097355352812102, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.01865798699145671, + "learning_rate": 7.602e-06, + "loss": 0.0007, + "num_tokens": 4867966.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 92.3125, + "completions/mean_terminated_length": 92.3125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.023212842129576163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.017468365229433402, + "learning_rate": 7.599999999999999e-06, + "loss": 0.0007, + "num_tokens": 4882824.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 110.4375, + "completions/mean_terminated_length": 110.4375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.023328328906340224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.01166692625702126, + "learning_rate": 7.598e-06, + "loss": 0.0005, + "num_tokens": 4903414.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 115.125, + "completions/mean_terminated_length": 115.125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.023443815683104285, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.029398989951005206, + "learning_rate": 7.596e-06, + "loss": 0.0012, + "num_tokens": 4919866.0, + "reward": 3.931021213531494, + "reward_std": 0.39020276069641113, + "rewards/reward_fn/mean": 3.931021213531494, + "rewards/reward_fn/std": 0.39020267128944397, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 158.875, + "completions/mean_terminated_length": 158.875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.023559302459868346, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.01831955526722595, + "learning_rate": 7.594e-06, + "loss": 0.0007, + "num_tokens": 4937270.0, + "reward": 3.541457414627075, + "reward_std": 0.49891048669815063, + "rewards/reward_fn/mean": 3.541457414627075, + "rewards/reward_fn/std": 0.498910516500473, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 111.78125, + "completions/mean_terminated_length": 111.78125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.023674789236632406, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.025127143191639334, + "learning_rate": 7.5919999999999995e-06, + "loss": 0.001, + "num_tokens": 4956655.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 100.3125, + "completions/mean_terminated_length": 100.3125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.023790276013396467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.015294924000045285, + "learning_rate": 7.589999999999999e-06, + "loss": 0.0006, + "num_tokens": 4975705.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 110.96875, + "completions/mean_terminated_length": 110.96875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.023905762790160528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.01279439853533404, + "learning_rate": 7.588e-06, + "loss": 0.0005, + "num_tokens": 4997880.0, + "reward": 3.8610382080078125, + "reward_std": 0.5468087196350098, + "rewards/reward_fn/mean": 3.8610382080078125, + "rewards/reward_fn/std": 0.5468087196350098, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 275.6875, + "completions/mean_terminated_length": 275.6875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.02402124956692459, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.017334686970571056, + "learning_rate": 7.586e-06, + "loss": 0.0007, + "num_tokens": 5030574.0, + "reward": 3.839310884475708, + "reward_std": 0.4799947738647461, + "rewards/reward_fn/mean": 3.839310884475708, + "rewards/reward_fn/std": 0.4799947440624237, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 181.03125, + "completions/mean_terminated_length": 181.03125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.02413673634368865, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.018830131011782214, + "learning_rate": 7.583999999999999e-06, + "loss": 0.0008, + "num_tokens": 5058159.0, + "reward": 2.8674864768981934, + "reward_std": 0.03425685688853264, + "rewards/reward_fn/mean": 2.8674864768981934, + "rewards/reward_fn/std": 0.03425683453679085, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 256.4375, + "completions/mean_terminated_length": 256.4375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.024252223120452707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.015394567366456613, + "learning_rate": 7.5819999999999996e-06, + "loss": 0.0006, + "num_tokens": 5091741.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 99.4375, + "completions/mean_terminated_length": 99.4375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.024367709897216767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2021484375, + "kl": 0.01989277810935164, + "learning_rate": 7.5799999999999994e-06, + "loss": 0.0008, + "num_tokens": 5118763.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 236.90625, + "completions/mean_terminated_length": 236.90625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.024483196673980828, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.016911343875108287, + "learning_rate": 7.578e-06, + "loss": 0.0007, + "num_tokens": 5140328.0, + "reward": 2.8534975051879883, + "reward_std": 0.23115243017673492, + "rewards/reward_fn/mean": 2.8534975051879883, + "rewards/reward_fn/std": 0.23115241527557373, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 110.09375, + "completions/mean_terminated_length": 110.09375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.02459868345074489, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "kl": 0.028692307852907106, + "learning_rate": 7.575999999999999e-06, + "loss": 0.0011, + "num_tokens": 5157227.0, + "reward": 3.842270612716675, + "reward_std": 0.37246301770210266, + "rewards/reward_fn/mean": 3.842270612716675, + "rewards/reward_fn/std": 0.37246301770210266, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 50.9375, + "completions/mean_terminated_length": 50.9375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.02471417022750895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.022671034646918997, + "learning_rate": 7.574e-06, + "loss": 0.0009, + "num_tokens": 5173673.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 103.21875, + "completions/mean_terminated_length": 103.21875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.02482965700427301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.020398159787873738, + "learning_rate": 7.572e-06, + "loss": 0.0008, + "num_tokens": 5194256.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 90.21875, + "completions/mean_terminated_length": 90.21875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.02494514378103707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.019276974111562595, + "learning_rate": 7.5699999999999995e-06, + "loss": 0.0008, + "num_tokens": 5212791.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 254.65625, + "completions/mean_terminated_length": 254.65625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.025060630557801132, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.01861691998783499, + "learning_rate": 7.567999999999999e-06, + "loss": 0.0007, + "num_tokens": 5233292.0, + "reward": 2.5513319969177246, + "reward_std": 1.065507411956787, + "rewards/reward_fn/mean": 2.5513319969177246, + "rewards/reward_fn/std": 1.065507411956787, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 206.96875, + "completions/mean_terminated_length": 206.96875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.025176117334565193, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.021067319496069103, + "learning_rate": 7.565999999999999e-06, + "loss": 0.0008, + "num_tokens": 5254795.0, + "reward": 3.8499417304992676, + "reward_std": 0.5904775261878967, + "rewards/reward_fn/mean": 3.8499417304992676, + "rewards/reward_fn/std": 0.5904775261878967, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 125.5625, + "completions/mean_terminated_length": 125.5625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.025291604111329254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1630859375, + "kl": 0.027750003209803253, + "learning_rate": 7.564e-06, + "loss": 0.0011, + "num_tokens": 5283581.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 81.46875, + "completions/mean_terminated_length": 81.46875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.025407090888093314, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3125, + "kl": 0.022312811532174237, + "learning_rate": 7.562e-06, + "loss": 0.0009, + "num_tokens": 5310860.0, + "reward": 3.6014273166656494, + "reward_std": 0.2871040105819702, + "rewards/reward_fn/mean": 3.6014273166656494, + "rewards/reward_fn/std": 0.2871039807796478, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 167.15625, + "completions/mean_terminated_length": 167.15625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.025522577664857375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.02010730662732385, + "learning_rate": 7.56e-06, + "loss": 0.0008, + "num_tokens": 5331217.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 152.71875, + "completions/mean_terminated_length": 152.71875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.025638064441621436, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.022095499851275235, + "learning_rate": 7.5579999999999995e-06, + "loss": 0.0009, + "num_tokens": 5352456.0, + "reward": 3.9505839347839355, + "reward_std": 0.19445617496967316, + "rewards/reward_fn/mean": 3.9505839347839355, + "rewards/reward_fn/std": 0.19445617496967316, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 318.25, + "completions/mean_terminated_length": 318.25, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.025753551218385493, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.014035579573828727, + "learning_rate": 7.555999999999999e-06, + "loss": 0.0006, + "num_tokens": 5374448.0, + "reward": 3.9290590286254883, + "reward_std": 0.4013024866580963, + "rewards/reward_fn/mean": 3.9290590286254883, + "rewards/reward_fn/std": 0.4013024568557739, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 62.09375, + "completions/mean_terminated_length": 62.09375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.025869037995149554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1630859375, + "kl": 0.014126054076768924, + "learning_rate": 7.554e-06, + "loss": 0.0006, + "num_tokens": 5393427.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 170.84375, + "completions/mean_terminated_length": 170.84375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.025984524771913615, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.02755536389304325, + "learning_rate": 7.551999999999999e-06, + "loss": 0.0011, + "num_tokens": 5410510.0, + "reward": 3.7647318840026855, + "reward_std": 0.45780742168426514, + "rewards/reward_fn/mean": 3.7647318840026855, + "rewards/reward_fn/std": 0.45780739188194275, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 176.0625, + "completions/mean_terminated_length": 176.0625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.026100011548677676, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.015338474404416047, + "learning_rate": 7.55e-06, + "loss": 0.0006, + "num_tokens": 5433776.0, + "reward": 3.8672637939453125, + "reward_std": 0.46054545044898987, + "rewards/reward_fn/mean": 3.8672637939453125, + "rewards/reward_fn/std": 0.46054548025131226, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 188.84375, + "completions/mean_terminated_length": 188.84375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.026215498325441736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.014504563790978864, + "learning_rate": 7.5479999999999996e-06, + "loss": 0.0006, + "num_tokens": 5464651.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 173.34375, + "completions/mean_terminated_length": 173.34375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.026330985102205797, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.02530768860015087, + "learning_rate": 7.546e-06, + "loss": 0.001, + "num_tokens": 5492918.0, + "reward": 3.939840316772461, + "reward_std": 0.19037142395973206, + "rewards/reward_fn/mean": 3.939840316772461, + "rewards/reward_fn/std": 0.19037140905857086, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 309.75, + "completions/mean_terminated_length": 309.75, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.026446471878969858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.013871176284737885, + "learning_rate": 7.543999999999999e-06, + "loss": 0.0006, + "num_tokens": 5521454.0, + "reward": 3.7056844234466553, + "reward_std": 0.5423300862312317, + "rewards/reward_fn/mean": 3.7056844234466553, + "rewards/reward_fn/std": 0.5423300862312317, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1886.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 495.46875, + "completions/mean_terminated_length": 495.46875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.02656195865573392, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.014133057135040872, + "learning_rate": 7.541999999999999e-06, + "loss": 0.0006, + "num_tokens": 5550013.0, + "reward": 3.8549253940582275, + "reward_std": 0.570894718170166, + "rewards/reward_fn/mean": 3.8549253940582275, + "rewards/reward_fn/std": 0.5708946585655212, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 355.5625, + "completions/mean_terminated_length": 355.5625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.02667744543249798, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.013163079536752775, + "learning_rate": 7.54e-06, + "loss": 0.0005, + "num_tokens": 5585935.0, + "reward": 3.277529001235962, + "reward_std": 1.0887781381607056, + "rewards/reward_fn/mean": 3.277529001235962, + "rewards/reward_fn/std": 1.0887782573699951, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 252.21875, + "completions/mean_terminated_length": 252.21875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.02679293220926204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.016617214947473258, + "learning_rate": 7.538e-06, + "loss": 0.0007, + "num_tokens": 5602902.0, + "reward": 3.5810513496398926, + "reward_std": 0.7991823554039001, + "rewards/reward_fn/mean": 3.5810513496398926, + "rewards/reward_fn/std": 0.7991824150085449, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 126.53125, + "completions/mean_terminated_length": 126.53125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.0269084189860261, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.02730229278677143, + "learning_rate": 7.5359999999999995e-06, + "loss": 0.0011, + "num_tokens": 5636647.0, + "reward": 3.930551528930664, + "reward_std": 0.3928599953651428, + "rewards/reward_fn/mean": 3.930551528930664, + "rewards/reward_fn/std": 0.3928600549697876, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 227.53125, + "completions/mean_terminated_length": 227.53125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.027023905762790162, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.016814389382489026, + "learning_rate": 7.533999999999999e-06, + "loss": 0.0007, + "num_tokens": 5652824.0, + "reward": 3.83772611618042, + "reward_std": 0.5617800951004028, + "rewards/reward_fn/mean": 3.83772611618042, + "rewards/reward_fn/std": 0.5617800951004028, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 267.1875, + "completions/mean_terminated_length": 267.1875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.027139392539554223, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.01977840348263271, + "learning_rate": 7.532e-06, + "loss": 0.0008, + "num_tokens": 5680190.0, + "reward": 2.804980516433716, + "reward_std": 0.3534238040447235, + "rewards/reward_fn/mean": 2.804980516433716, + "rewards/reward_fn/std": 0.3534237742424011, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 239.59375, + "completions/mean_terminated_length": 239.59375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.027254879316318283, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.015610076079610735, + "learning_rate": 7.53e-06, + "loss": 0.0006, + "num_tokens": 5710257.0, + "reward": 3.9103341102600098, + "reward_std": 0.2416064590215683, + "rewards/reward_fn/mean": 3.9103341102600098, + "rewards/reward_fn/std": 0.2416064590215683, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 148.53125, + "completions/mean_terminated_length": 148.53125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.02737036609308234, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.02333199084387161, + "learning_rate": 7.527999999999999e-06, + "loss": 0.0009, + "num_tokens": 5726114.0, + "reward": 3.6343088150024414, + "reward_std": 0.5382227301597595, + "rewards/reward_fn/mean": 3.6343088150024414, + "rewards/reward_fn/std": 0.5382227301597595, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 378.375, + "completions/mean_terminated_length": 378.375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.0274858528698464, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.01422135895700194, + "learning_rate": 7.526e-06, + "loss": 0.0006, + "num_tokens": 5760526.0, + "reward": 2.7404513359069824, + "reward_std": 0.4138166308403015, + "rewards/reward_fn/mean": 2.7404513359069824, + "rewards/reward_fn/std": 0.4138166606426239, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 879.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 392.625, + "completions/mean_terminated_length": 392.625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.027601339646610462, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.013727354424190708, + "learning_rate": 7.5239999999999995e-06, + "loss": 0.0005, + "num_tokens": 5791202.0, + "reward": 3.7214114665985107, + "reward_std": 0.7754892110824585, + "rewards/reward_fn/mean": 3.7214114665985107, + "rewards/reward_fn/std": 0.7754892110824585, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 88.3125, + "completions/mean_terminated_length": 88.3125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.027716826423374523, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.019998641932033934, + "learning_rate": 7.522e-06, + "loss": 0.0008, + "num_tokens": 5806380.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 350.5625, + "completions/mean_terminated_length": 350.5625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.027832313200138584, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.015723320189863443, + "learning_rate": 7.519999999999999e-06, + "loss": 0.0006, + "num_tokens": 5830078.0, + "reward": 3.71919846534729, + "reward_std": 0.7548263072967529, + "rewards/reward_fn/mean": 3.71919846534729, + "rewards/reward_fn/std": 0.7548263669013977, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 158.84375, + "completions/mean_terminated_length": 158.84375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.027947799976902644, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "kl": 0.016763413092121482, + "learning_rate": 7.518e-06, + "loss": 0.0007, + "num_tokens": 5854425.0, + "reward": 3.967320442199707, + "reward_std": 0.18486376106739044, + "rewards/reward_fn/mean": 3.967320442199707, + "rewards/reward_fn/std": 0.18486374616622925, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 206.40625, + "completions/mean_terminated_length": 206.40625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.028063286753666705, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.021009535354096442, + "learning_rate": 7.516e-06, + "loss": 0.0008, + "num_tokens": 5883142.0, + "reward": 3.8526430130004883, + "reward_std": 0.3514340817928314, + "rewards/reward_fn/mean": 3.8526430130004883, + "rewards/reward_fn/std": 0.3514340817928314, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 135.78125, + "completions/mean_terminated_length": 135.78125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.028178773530430766, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.017801383830374107, + "learning_rate": 7.5139999999999995e-06, + "loss": 0.0007, + "num_tokens": 5903743.0, + "reward": 3.904895782470703, + "reward_std": 0.3009895384311676, + "rewards/reward_fn/mean": 3.904895782470703, + "rewards/reward_fn/std": 0.3009895384311676, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 71.90625, + "completions/mean_terminated_length": 71.90625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.028294260307194827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23046875, + "kl": 0.024668138823471963, + "learning_rate": 7.511999999999999e-06, + "loss": 0.001, + "num_tokens": 5928252.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 370.0625, + "completions/mean_terminated_length": 370.0625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.028409747083958888, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.012690110132098198, + "learning_rate": 7.509999999999999e-06, + "loss": 0.0005, + "num_tokens": 5949726.0, + "reward": 3.9334678649902344, + "reward_std": 0.37636274099349976, + "rewards/reward_fn/mean": 3.9334678649902344, + "rewards/reward_fn/std": 0.37636274099349976, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 169.1875, + "completions/mean_terminated_length": 169.1875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.02852523386072295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.028603437065612525, + "learning_rate": 7.508e-06, + "loss": 0.0011, + "num_tokens": 5973380.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 129.59375, + "completions/mean_terminated_length": 129.59375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.02864072063748701, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.734375, + "kl": 0.02140038533252664, + "learning_rate": 7.506e-06, + "loss": 0.0009, + "num_tokens": 5990967.0, + "reward": 3.929082155227661, + "reward_std": 0.4011715352535248, + "rewards/reward_fn/mean": 3.929082155227661, + "rewards/reward_fn/std": 0.4011715352535248, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 66.59375, + "completions/mean_terminated_length": 66.59375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.02875620741425107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.01585353164409753, + "learning_rate": 7.503999999999999e-06, + "loss": 0.0006, + "num_tokens": 6014890.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 91.0625, + "completions/mean_terminated_length": 91.0625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.028871694191015127, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.953125, + "kl": 0.01663297356572002, + "learning_rate": 7.5019999999999995e-06, + "loss": 0.0007, + "num_tokens": 6029708.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 258.78125, + "completions/mean_terminated_length": 258.78125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.028987180967779188, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.018714004385401495, + "learning_rate": 7.499999999999999e-06, + "loss": 0.0007, + "num_tokens": 6052837.0, + "reward": 3.9325830936431885, + "reward_std": 0.26528313755989075, + "rewards/reward_fn/mean": 3.9325830936431885, + "rewards/reward_fn/std": 0.26528310775756836, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 110.46875, + "completions/mean_terminated_length": 110.46875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.02910266774454325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.019979729113401845, + "learning_rate": 7.498e-06, + "loss": 0.0008, + "num_tokens": 6068532.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 101.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 65.78125, + "completions/mean_terminated_length": 65.78125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.02921815452130731, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.013501858848030679, + "learning_rate": 7.496e-06, + "loss": 0.0005, + "num_tokens": 6087501.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 70.3125, + "completions/mean_terminated_length": 70.3125, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.02933364129807137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2734375, + "kl": 0.024934177577961236, + "learning_rate": 7.494e-06, + "loss": 0.001, + "num_tokens": 6109751.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 242.1875, + "completions/mean_terminated_length": 242.1875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.02944912807483543, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.019514923420501873, + "learning_rate": 7.492e-06, + "loss": 0.0008, + "num_tokens": 6132093.0, + "reward": 3.8535568714141846, + "reward_std": 0.5762470364570618, + "rewards/reward_fn/mean": 3.8535568714141846, + "rewards/reward_fn/std": 0.576246976852417, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 65.9375, + "completions/mean_terminated_length": 65.9375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.029564614851599492, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.01665993440838065, + "learning_rate": 7.49e-06, + "loss": 0.0007, + "num_tokens": 6149851.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 301.21875, + "completions/mean_terminated_length": 301.21875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.029680101628363553, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.01731674869370181, + "learning_rate": 7.488e-06, + "loss": 0.0007, + "num_tokens": 6178658.0, + "reward": 3.5597987174987793, + "reward_std": 0.7706182599067688, + "rewards/reward_fn/mean": 3.5597987174987793, + "rewards/reward_fn/std": 0.7706182599067688, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 146.1875, + "completions/mean_terminated_length": 146.1875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.029795588405127613, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.015502143171033822, + "learning_rate": 7.485999999999999e-06, + "loss": 0.0006, + "num_tokens": 6207624.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 274.375, + "completions/mean_terminated_length": 274.375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.029911075181891674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.015068357373820618, + "learning_rate": 7.484e-06, + "loss": 0.0006, + "num_tokens": 6232436.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 174.65625, + "completions/mean_terminated_length": 174.65625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.030026561958655735, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.017744243814377114, + "learning_rate": 7.482e-06, + "loss": 0.0007, + "num_tokens": 6253033.0, + "reward": 3.862534523010254, + "reward_std": 0.5409870147705078, + "rewards/reward_fn/mean": 3.862534523010254, + "rewards/reward_fn/std": 0.540986955165863, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 82.15625, + "completions/mean_terminated_length": 82.15625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.030142048735419796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.011309382342005847, + "learning_rate": 7.48e-06, + "loss": 0.0005, + "num_tokens": 6278670.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 224.40625, + "completions/mean_terminated_length": 224.40625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.030257535512183856, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.0194423041830305, + "learning_rate": 7.477999999999999e-06, + "loss": 0.0008, + "num_tokens": 6302011.0, + "reward": 2.9081344604492188, + "reward_std": 0.058720655739307404, + "rewards/reward_fn/mean": 2.9081344604492188, + "rewards/reward_fn/std": 0.05872063711285591, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 206.96875, + "completions/mean_terminated_length": 206.96875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.030373022288947917, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.018001933451159857, + "learning_rate": 7.475999999999999e-06, + "loss": 0.0007, + "num_tokens": 6328890.0, + "reward": 3.801466941833496, + "reward_std": 0.38339611887931824, + "rewards/reward_fn/mean": 3.801466941833496, + "rewards/reward_fn/std": 0.38339611887931824, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 173.59375, + "completions/mean_terminated_length": 173.59375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.030488509065711974, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.00977243397210259, + "learning_rate": 7.474e-06, + "loss": 0.0004, + "num_tokens": 6357357.0, + "reward": 2.770228385925293, + "reward_std": 0.04741431400179863, + "rewards/reward_fn/mean": 2.770228385925293, + "rewards/reward_fn/std": 0.047414299100637436, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 426.8125, + "completions/mean_terminated_length": 374.51611328125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.030603995842476035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7421875, + "kl": 0.01640156196663156, + "learning_rate": 7.472e-06, + "loss": 0.0007, + "num_tokens": 6394439.0, + "reward": 2.9427661895751953, + "reward_std": 0.6255316138267517, + "rewards/reward_fn/mean": 2.9427661895751953, + "rewards/reward_fn/std": 0.6255316138267517, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 268.75, + "completions/mean_terminated_length": 268.75, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.030719482619240096, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.019831194367725402, + "learning_rate": 7.47e-06, + "loss": 0.0008, + "num_tokens": 6425791.0, + "reward": 2.833824872970581, + "reward_std": 0.8580666184425354, + "rewards/reward_fn/mean": 2.833824872970581, + "rewards/reward_fn/std": 0.8580665588378906, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 183.15625, + "completions/mean_terminated_length": 183.15625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.030834969396004157, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.026690719794714823, + "learning_rate": 7.4679999999999995e-06, + "loss": 0.0011, + "num_tokens": 6446340.0, + "reward": 3.857937812805176, + "reward_std": 0.559018611907959, + "rewards/reward_fn/mean": 3.857937812805176, + "rewards/reward_fn/std": 0.559018611907959, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 74.6875, + "completions/mean_terminated_length": 74.6875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.030950456172768218, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.01193908372806618, + "learning_rate": 7.466e-06, + "loss": 0.0005, + "num_tokens": 6464698.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 224.21875, + "completions/mean_terminated_length": 224.21875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.03106594294953228, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.019050869916100055, + "learning_rate": 7.464e-06, + "loss": 0.0008, + "num_tokens": 6485505.0, + "reward": 3.967161178588867, + "reward_std": 0.18576456606388092, + "rewards/reward_fn/mean": 3.967161178588867, + "rewards/reward_fn/std": 0.18576455116271973, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 249.0, + "completions/mean_terminated_length": 249.0, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.03118142972629634, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.01861171133350581, + "learning_rate": 7.461999999999999e-06, + "loss": 0.0007, + "num_tokens": 6511425.0, + "reward": 3.717031717300415, + "reward_std": 0.40917059779167175, + "rewards/reward_fn/mean": 3.717031717300415, + "rewards/reward_fn/std": 0.40917056798934937, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 168.53125, + "completions/mean_terminated_length": 168.53125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.0312969165030604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.019375125411897898, + "learning_rate": 7.46e-06, + "loss": 0.0008, + "num_tokens": 6528722.0, + "reward": 3.9299185276031494, + "reward_std": 0.3964402973651886, + "rewards/reward_fn/mean": 3.9299185276031494, + "rewards/reward_fn/std": 0.3964402377605438, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 178.09375, + "completions/mean_terminated_length": 178.09375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.03141240327982446, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.019383833627216518, + "learning_rate": 7.4579999999999996e-06, + "loss": 0.0008, + "num_tokens": 6559797.0, + "reward": 3.9316201210021973, + "reward_std": 0.38681530952453613, + "rewards/reward_fn/mean": 3.9316201210021973, + "rewards/reward_fn/std": 0.3868153393268585, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 181.5625, + "completions/mean_terminated_length": 181.5625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.03152789005658852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.020167990995105356, + "learning_rate": 7.456e-06, + "loss": 0.0008, + "num_tokens": 6578727.0, + "reward": 3.684670925140381, + "reward_std": 0.5415733456611633, + "rewards/reward_fn/mean": 3.684670925140381, + "rewards/reward_fn/std": 0.5415733456611633, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 327.625, + "completions/mean_terminated_length": 327.625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.03164337683335258, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.013769291908829473, + "learning_rate": 7.453999999999999e-06, + "loss": 0.0006, + "num_tokens": 6608059.0, + "reward": 3.8180854320526123, + "reward_std": 0.6105458736419678, + "rewards/reward_fn/mean": 3.8180854320526123, + "rewards/reward_fn/std": 0.6105458736419678, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 160.6875, + "completions/mean_terminated_length": 160.6875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.03175886361011664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.022940016599022783, + "learning_rate": 7.452e-06, + "loss": 0.0009, + "num_tokens": 6626705.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 95.625, + "completions/mean_terminated_length": 95.625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.031874350386880704, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.014759518235223368, + "learning_rate": 7.45e-06, + "loss": 0.0006, + "num_tokens": 6652005.0, + "reward": 3.965640068054199, + "reward_std": 0.10862741619348526, + "rewards/reward_fn/mean": 3.965640068054199, + "rewards/reward_fn/std": 0.10862741619348526, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 315.0625, + "completions/mean_terminated_length": 315.0625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.031989837163644765, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.014888645702740178, + "learning_rate": 7.448e-06, + "loss": 0.0006, + "num_tokens": 6677287.0, + "reward": 3.221201181411743, + "reward_std": 1.0937111377716064, + "rewards/reward_fn/mean": 3.221201181411743, + "rewards/reward_fn/std": 1.0937111377716064, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 91.09375, + "completions/mean_terminated_length": 91.09375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.032105323940408825, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.012395441706757993, + "learning_rate": 7.4459999999999995e-06, + "loss": 0.0005, + "num_tokens": 6693002.0, + "reward": 3.9690804481506348, + "reward_std": 0.1749081015586853, + "rewards/reward_fn/mean": 3.9690804481506348, + "rewards/reward_fn/std": 0.1749081015586853, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 70.8125, + "completions/mean_terminated_length": 70.8125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.032220810717172886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.251953125, + "kl": 0.02833255671430379, + "learning_rate": 7.443999999999999e-06, + "loss": 0.0011, + "num_tokens": 6713892.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 291.15625, + "completions/mean_terminated_length": 291.15625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.03233629749393695, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.023046029324177653, + "learning_rate": 7.442e-06, + "loss": 0.0009, + "num_tokens": 6741705.0, + "reward": 3.8350601196289062, + "reward_std": 0.5701581835746765, + "rewards/reward_fn/mean": 3.8350601196289062, + "rewards/reward_fn/std": 0.5701582431793213, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 95.6875, + "completions/mean_terminated_length": 95.6875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.03245178427070101, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.013699944189283997, + "learning_rate": 7.44e-06, + "loss": 0.0005, + "num_tokens": 6768063.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 113.3125, + "completions/mean_terminated_length": 113.3125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.03256727104746507, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.01910720474552363, + "learning_rate": 7.438e-06, + "loss": 0.0008, + "num_tokens": 6785993.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 346.96875, + "completions/mean_terminated_length": 292.0967712402344, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.03268275782422913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.016036100989367696, + "learning_rate": 7.436e-06, + "loss": 0.0006, + "num_tokens": 6809000.0, + "reward": 3.521399736404419, + "reward_std": 0.8439139723777771, + "rewards/reward_fn/mean": 3.521399736404419, + "rewards/reward_fn/std": 0.8439139723777771, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 215.46875, + "completions/mean_terminated_length": 215.46875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.03279824460099318, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.018034519453067333, + "learning_rate": 7.4339999999999995e-06, + "loss": 0.0007, + "num_tokens": 6840151.0, + "reward": 3.7286736965179443, + "reward_std": 0.7041162252426147, + "rewards/reward_fn/mean": 3.7286736965179443, + "rewards/reward_fn/std": 0.7041162252426147, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 416.96875, + "completions/mean_terminated_length": 416.96875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.032913731377757244, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.01441318575234618, + "learning_rate": 7.432e-06, + "loss": 0.0006, + "num_tokens": 6869014.0, + "reward": 3.6467418670654297, + "reward_std": 0.9804046750068665, + "rewards/reward_fn/mean": 3.6467418670654297, + "rewards/reward_fn/std": 0.9804046154022217, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 83.375, + "completions/mean_terminated_length": 83.375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.033029218154521305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.016642103350022808, + "learning_rate": 7.429999999999999e-06, + "loss": 0.0007, + "num_tokens": 6883202.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 100.65625, + "completions/mean_terminated_length": 100.65625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.033144704931285365, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.02024996464024298, + "learning_rate": 7.428e-06, + "loss": 0.0008, + "num_tokens": 6899063.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 323.9375, + "completions/mean_terminated_length": 323.9375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.033260191708049426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.01279543423152063, + "learning_rate": 7.426e-06, + "loss": 0.0005, + "num_tokens": 6925013.0, + "reward": 3.8557276725769043, + "reward_std": 0.567710816860199, + "rewards/reward_fn/mean": 3.8557276725769043, + "rewards/reward_fn/std": 0.5677107572555542, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 159.84375, + "completions/mean_terminated_length": 159.84375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.03337567848481349, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.021814291234477423, + "learning_rate": 7.424e-06, + "loss": 0.0009, + "num_tokens": 6947984.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1766.0, + "completions/mean_length": 381.03125, + "completions/mean_terminated_length": 327.258056640625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.03349116526157755, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7265625, + "kl": 0.017173164582345635, + "learning_rate": 7.421999999999999e-06, + "loss": 0.0007, + "num_tokens": 6972561.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 139.03125, + "completions/mean_terminated_length": 139.03125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.03360665203834161, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1123046875, + "kl": 0.020997778861783445, + "learning_rate": 7.419999999999999e-06, + "loss": 0.0008, + "num_tokens": 6995442.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 277.875, + "completions/mean_terminated_length": 277.875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.03372213881510567, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.014906992859323509, + "learning_rate": 7.418e-06, + "loss": 0.0006, + "num_tokens": 7015598.0, + "reward": 3.9706220626831055, + "reward_std": 0.1661863923072815, + "rewards/reward_fn/mean": 3.9706220626831055, + "rewards/reward_fn/std": 0.1661863625049591, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 262.46875, + "completions/mean_terminated_length": 262.46875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.03383762559186973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.013659992517204955, + "learning_rate": 7.416e-06, + "loss": 0.0005, + "num_tokens": 7039677.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 175.0625, + "completions/mean_terminated_length": 175.0625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.03395311236863379, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.019286645634565502, + "learning_rate": 7.414e-06, + "loss": 0.0008, + "num_tokens": 7069791.0, + "reward": 3.925260066986084, + "reward_std": 0.42279207706451416, + "rewards/reward_fn/mean": 3.925260066986084, + "rewards/reward_fn/std": 0.4227920472621918, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 75.90625, + "completions/mean_terminated_length": 75.90625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.03406859914539785, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8125, + "kl": 0.016535192415176425, + "learning_rate": 7.4119999999999995e-06, + "loss": 0.0007, + "num_tokens": 7085180.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 142.6875, + "completions/mean_terminated_length": 142.6875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.03418408592216191, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435546875, + "kl": 0.02401417409419082, + "learning_rate": 7.41e-06, + "loss": 0.001, + "num_tokens": 7113106.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 339.03125, + "completions/mean_terminated_length": 339.03125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.03429957269892597, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.019004526373464614, + "learning_rate": 7.408e-06, + "loss": 0.0008, + "num_tokens": 7135891.0, + "reward": 3.6375534534454346, + "reward_std": 0.8557912707328796, + "rewards/reward_fn/mean": 3.6375534534454346, + "rewards/reward_fn/std": 0.8557912707328796, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 208.84375, + "completions/mean_terminated_length": 208.84375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.034415059475690034, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.014341003872686997, + "learning_rate": 7.405999999999999e-06, + "loss": 0.0006, + "num_tokens": 7154702.0, + "reward": 3.358577251434326, + "reward_std": 0.7427864074707031, + "rewards/reward_fn/mean": 3.358577251434326, + "rewards/reward_fn/std": 0.7427863478660583, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 360.0625, + "completions/mean_terminated_length": 360.0625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.034530546252454095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.015499743589316495, + "learning_rate": 7.404e-06, + "loss": 0.0006, + "num_tokens": 7190608.0, + "reward": 3.894913911819458, + "reward_std": 0.4415956139564514, + "rewards/reward_fn/mean": 3.894913911819458, + "rewards/reward_fn/std": 0.44159558415412903, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 108.25, + "completions/mean_terminated_length": 108.25, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.034646033029218155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.023514228145359084, + "learning_rate": 7.402e-06, + "loss": 0.0009, + "num_tokens": 7213880.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 217.5625, + "completions/mean_terminated_length": 217.5625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.034761519805982216, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.020534433278953657, + "learning_rate": 7.4e-06, + "loss": 0.0008, + "num_tokens": 7237898.0, + "reward": 3.9753036499023438, + "reward_std": 0.13970352709293365, + "rewards/reward_fn/mean": 3.9753036499023438, + "rewards/reward_fn/std": 0.13970352709293365, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 106.71875, + "completions/mean_terminated_length": 106.71875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.03487700658274628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.01903102354845032, + "learning_rate": 7.397999999999999e-06, + "loss": 0.0008, + "num_tokens": 7254081.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 104.90625, + "completions/mean_terminated_length": 104.90625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.03499249335951034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.020694191742222756, + "learning_rate": 7.395999999999999e-06, + "loss": 0.0008, + "num_tokens": 7267038.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 190.53125, + "completions/mean_terminated_length": 190.53125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.0351079801362744, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.013844187546055764, + "learning_rate": 7.394e-06, + "loss": 0.0006, + "num_tokens": 7292399.0, + "reward": 3.81715726852417, + "reward_std": 0.3533138334751129, + "rewards/reward_fn/mean": 3.81715726852417, + "rewards/reward_fn/std": 0.35331377387046814, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 295.75, + "completions/mean_terminated_length": 295.75, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.03522346691303846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.014381561544723809, + "learning_rate": 7.392e-06, + "loss": 0.0006, + "num_tokens": 7316359.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 80.71875, + "completions/mean_terminated_length": 80.71875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.03533895368980252, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "kl": 0.025253614847315475, + "learning_rate": 7.3899999999999995e-06, + "loss": 0.001, + "num_tokens": 7337150.0, + "reward": 3.379918098449707, + "reward_std": 0.03159104287624359, + "rewards/reward_fn/mean": 3.379918098449707, + "rewards/reward_fn/std": 0.031591057777404785, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 137.03125, + "completions/mean_terminated_length": 137.03125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.03545444046656658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.020041497409692965, + "learning_rate": 7.387999999999999e-06, + "loss": 0.0008, + "num_tokens": 7359199.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 185.84375, + "completions/mean_terminated_length": 185.84375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.03556992724333064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.016654927312629297, + "learning_rate": 7.386e-06, + "loss": 0.0007, + "num_tokens": 7381114.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 365.21875, + "completions/mean_terminated_length": 365.21875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.0356854140200947, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.018344701296882704, + "learning_rate": 7.384e-06, + "loss": 0.0007, + "num_tokens": 7415169.0, + "reward": 3.4894561767578125, + "reward_std": 0.9280169010162354, + "rewards/reward_fn/mean": 3.4894561767578125, + "rewards/reward_fn/std": 0.9280168414115906, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 106.4375, + "completions/mean_terminated_length": 106.4375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.03580090079685876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.01205358628067188, + "learning_rate": 7.381999999999999e-06, + "loss": 0.0005, + "num_tokens": 7441711.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 92.3125, + "completions/mean_terminated_length": 92.3125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.03591638757362282, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.02096794125100132, + "learning_rate": 7.38e-06, + "loss": 0.0008, + "num_tokens": 7456569.0, + "reward": 3.2906060218811035, + "reward_std": 0.02338166907429695, + "rewards/reward_fn/mean": 3.2906060218811035, + "rewards/reward_fn/std": 0.0233816709369421, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 218.1875, + "completions/mean_terminated_length": 218.1875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.03603187435038688, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.015831449010875076, + "learning_rate": 7.3779999999999995e-06, + "loss": 0.0006, + "num_tokens": 7479583.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 233.5625, + "completions/mean_terminated_length": 233.5625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.03614736112715094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.018866984668420628, + "learning_rate": 7.376e-06, + "loss": 0.0008, + "num_tokens": 7499537.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 221.34375, + "completions/mean_terminated_length": 221.34375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.036262847903915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.01454188654315658, + "learning_rate": 7.373999999999999e-06, + "loss": 0.0006, + "num_tokens": 7526236.0, + "reward": 3.7051398754119873, + "reward_std": 0.4808679521083832, + "rewards/reward_fn/mean": 3.7051398754119873, + "rewards/reward_fn/std": 0.4808679521083832, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 338.375, + "completions/mean_terminated_length": 338.375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.03637833468067906, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.018224038474727422, + "learning_rate": 7.372e-06, + "loss": 0.0007, + "num_tokens": 7548136.0, + "reward": 3.581106185913086, + "reward_std": 0.8860512971878052, + "rewards/reward_fn/mean": 3.581106185913086, + "rewards/reward_fn/std": 0.8860512375831604, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 65.09375, + "completions/mean_terminated_length": 65.09375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.03649382145744312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.169921875, + "kl": 0.017071369496989064, + "learning_rate": 7.37e-06, + "loss": 0.0007, + "num_tokens": 7562667.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 117.5625, + "completions/mean_terminated_length": 117.5625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.03660930823420718, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.019728886763914488, + "learning_rate": 7.368e-06, + "loss": 0.0008, + "num_tokens": 7584765.0, + "reward": 3.969188690185547, + "reward_std": 0.17429427802562714, + "rewards/reward_fn/mean": 3.969188690185547, + "rewards/reward_fn/std": 0.17429429292678833, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 81.21875, + "completions/mean_terminated_length": 81.21875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.03672479501097124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.189453125, + "kl": 0.018942095412057824, + "learning_rate": 7.3659999999999994e-06, + "loss": 0.0008, + "num_tokens": 7606276.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 346.96875, + "completions/mean_terminated_length": 346.96875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.0368402817877353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057373046875, + "kl": 0.015121824806556106, + "learning_rate": 7.363999999999999e-06, + "loss": 0.0006, + "num_tokens": 7632387.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 142.75, + "completions/mean_terminated_length": 142.75, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.036955768564499364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.008967340130766388, + "learning_rate": 7.362e-06, + "loss": 0.0004, + "num_tokens": 7653083.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 73.40625, + "completions/mean_terminated_length": 73.40625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.037071255341263425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1953125, + "kl": 0.020387535521876998, + "learning_rate": 7.36e-06, + "loss": 0.0008, + "num_tokens": 7664168.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 230.75, + "completions/mean_terminated_length": 230.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.037186742118027485, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.02422497622319497, + "learning_rate": 7.358e-06, + "loss": 0.001, + "num_tokens": 7694272.0, + "reward": 3.0399770736694336, + "reward_std": 0.5694229602813721, + "rewards/reward_fn/mean": 3.0399770736694336, + "rewards/reward_fn/std": 0.5694229602813721, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 55.0625, + "completions/mean_terminated_length": 55.0625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.037302228894791546, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.21875, + "kl": 0.022887554674525745, + "learning_rate": 7.3559999999999995e-06, + "loss": 0.0009, + "num_tokens": 7714946.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 80.5625, + "completions/mean_terminated_length": 80.5625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.03741771567155561, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.01105003418524575, + "learning_rate": 7.353999999999999e-06, + "loss": 0.0004, + "num_tokens": 7729780.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 344.8125, + "completions/mean_terminated_length": 289.8709716796875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.03753320244831967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.69140625, + "kl": 0.019092890332103707, + "learning_rate": 7.352e-06, + "loss": 0.0008, + "num_tokens": 7764846.0, + "reward": 3.9254257678985596, + "reward_std": 0.42185530066490173, + "rewards/reward_fn/mean": 3.9254257678985596, + "rewards/reward_fn/std": 0.42185530066490173, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 207.71875, + "completions/mean_terminated_length": 207.71875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.03764868922508373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.014704134984640405, + "learning_rate": 7.349999999999999e-06, + "loss": 0.0006, + "num_tokens": 7794149.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 101.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 75.4375, + "completions/mean_terminated_length": 75.4375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.03776417600184779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.008938628205214627, + "learning_rate": 7.348e-06, + "loss": 0.0004, + "num_tokens": 7826835.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 90.21875, + "completions/mean_terminated_length": 90.21875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.03787966277861185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.017686752922600135, + "learning_rate": 7.346e-06, + "loss": 0.0007, + "num_tokens": 7853402.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 154.34375, + "completions/mean_terminated_length": 154.34375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.03799514955537591, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.02248095627874136, + "learning_rate": 7.344e-06, + "loss": 0.0009, + "num_tokens": 7880229.0, + "reward": 3.330822467803955, + "reward_std": 0.178675577044487, + "rewards/reward_fn/mean": 3.330822467803955, + "rewards/reward_fn/std": 0.17867562174797058, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 73.96875, + "completions/mean_terminated_length": 73.96875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.03811063633213997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7578125, + "kl": 0.0328890937671531, + "learning_rate": 7.341999999999999e-06, + "loss": 0.0013, + "num_tokens": 7891940.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 203.34375, + "completions/mean_terminated_length": 203.34375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.03822612310890403, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.018224672880023718, + "learning_rate": 7.339999999999999e-06, + "loss": 0.0007, + "num_tokens": 7915151.0, + "reward": 3.1657559871673584, + "reward_std": 0.6152005791664124, + "rewards/reward_fn/mean": 3.1657559871673584, + "rewards/reward_fn/std": 0.6152005791664124, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 271.71875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.03834160988566809, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.020497378456639126, + "learning_rate": 7.338e-06, + "loss": 0.0008, + "num_tokens": 7936454.0, + "reward": 3.929274320602417, + "reward_std": 0.4000852704048157, + "rewards/reward_fn/mean": 3.929274320602417, + "rewards/reward_fn/std": 0.4000852704048157, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 358.78125, + "completions/mean_terminated_length": 358.78125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.038457096662432154, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.014500286866677925, + "learning_rate": 7.336e-06, + "loss": 0.0006, + "num_tokens": 7970143.0, + "reward": 3.670651435852051, + "reward_std": 0.5447534322738647, + "rewards/reward_fn/mean": 3.670651435852051, + "rewards/reward_fn/std": 0.5447534322738647, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 356.21875, + "completions/mean_terminated_length": 356.21875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.038572583439196215, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.01378821644175332, + "learning_rate": 7.334e-06, + "loss": 0.0006, + "num_tokens": 8005062.0, + "reward": 3.9281060695648193, + "reward_std": 0.40669316053390503, + "rewards/reward_fn/mean": 3.9281060695648193, + "rewards/reward_fn/std": 0.40669313073158264, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 216.21875, + "completions/mean_terminated_length": 216.21875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.038688070215960275, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.018450963427312672, + "learning_rate": 7.3319999999999994e-06, + "loss": 0.0007, + "num_tokens": 8033517.0, + "reward": 3.8098082542419434, + "reward_std": 0.4068799614906311, + "rewards/reward_fn/mean": 3.8098082542419434, + "rewards/reward_fn/std": 0.4068799614906311, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 238.09375, + "completions/mean_terminated_length": 238.09375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.038803556992724336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.021997255855239928, + "learning_rate": 7.33e-06, + "loss": 0.0009, + "num_tokens": 8058096.0, + "reward": 3.9177937507629395, + "reward_std": 0.26470571756362915, + "rewards/reward_fn/mean": 3.9177937507629395, + "rewards/reward_fn/std": 0.26470568776130676, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 249.28125, + "completions/mean_terminated_length": 249.28125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.0389190437694884, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.016587822130532004, + "learning_rate": 7.328e-06, + "loss": 0.0007, + "num_tokens": 8089337.0, + "reward": 3.8871850967407227, + "reward_std": 0.35756316781044006, + "rewards/reward_fn/mean": 3.8871850967407227, + "rewards/reward_fn/std": 0.35756316781044006, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 159.90625, + "completions/mean_terminated_length": 159.90625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.03903453054625245, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.01121224495000206, + "learning_rate": 7.325999999999999e-06, + "loss": 0.0004, + "num_tokens": 8105398.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 241.21875, + "completions/mean_terminated_length": 241.21875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.03915001732301651, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.017287559050600976, + "learning_rate": 7.324e-06, + "loss": 0.0007, + "num_tokens": 8125917.0, + "reward": 2.930148124694824, + "reward_std": 0.23257240653038025, + "rewards/reward_fn/mean": 2.930148124694824, + "rewards/reward_fn/std": 0.23257243633270264, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 80.25, + "completions/mean_terminated_length": 80.25, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.03926550409978057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2275390625, + "kl": 0.022334694280289114, + "learning_rate": 7.3219999999999995e-06, + "loss": 0.0009, + "num_tokens": 8142053.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 90.0625, + "completions/mean_terminated_length": 90.0625, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.03938099087654463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11669921875, + "kl": 0.014665011556644458, + "learning_rate": 7.32e-06, + "loss": 0.0006, + "num_tokens": 8161063.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 152.84375, + "completions/mean_terminated_length": 152.84375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.039496477653308694, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.014915794832631946, + "learning_rate": 7.317999999999999e-06, + "loss": 0.0006, + "num_tokens": 8184962.0, + "reward": 3.967909812927246, + "reward_std": 0.18152868747711182, + "rewards/reward_fn/mean": 3.967909812927246, + "rewards/reward_fn/std": 0.18152867257595062, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 128.0625, + "completions/mean_terminated_length": 128.0625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.039611964430072755, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.0396674518706277, + "learning_rate": 7.316e-06, + "loss": 0.0016, + "num_tokens": 8208292.0, + "reward": 3.5163278579711914, + "reward_std": 0.46199649572372437, + "rewards/reward_fn/mean": 3.5163278579711914, + "rewards/reward_fn/std": 0.46199652552604675, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 89.21875, + "completions/mean_terminated_length": 89.21875, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.039727451206836815, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "kl": 0.031824853911530226, + "learning_rate": 7.314e-06, + "loss": 0.0013, + "num_tokens": 8229355.0, + "reward": 3.9398040771484375, + "reward_std": 0.2405519038438797, + "rewards/reward_fn/mean": 3.9398040771484375, + "rewards/reward_fn/std": 0.2405519187450409, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 102.625, + "completions/mean_terminated_length": 102.625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.039842937983600876, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.008578728808060987, + "learning_rate": 7.312e-06, + "loss": 0.0003, + "num_tokens": 8251007.0, + "reward": 3.93219256401062, + "reward_std": 0.3835771083831787, + "rewards/reward_fn/mean": 3.93219256401062, + "rewards/reward_fn/std": 0.3835770785808563, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 167.40625, + "completions/mean_terminated_length": 167.40625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.03995842476036494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.022791170369600877, + "learning_rate": 7.3099999999999995e-06, + "loss": 0.0009, + "num_tokens": 8268172.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 181.78125, + "completions/mean_terminated_length": 181.78125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.040073911537129, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.014354248298332095, + "learning_rate": 7.307999999999999e-06, + "loss": 0.0006, + "num_tokens": 8286917.0, + "reward": 3.8030152320861816, + "reward_std": 0.5885424613952637, + "rewards/reward_fn/mean": 3.8030152320861816, + "rewards/reward_fn/std": 0.5885424613952637, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 163.125, + "completions/mean_terminated_length": 163.125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.04018939831389306, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.026504864043090492, + "learning_rate": 7.306e-06, + "loss": 0.0011, + "num_tokens": 8304841.0, + "reward": 3.930339813232422, + "reward_std": 0.22222881019115448, + "rewards/reward_fn/mean": 3.930339813232422, + "rewards/reward_fn/std": 0.2222287952899933, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 327.28125, + "completions/mean_terminated_length": 327.28125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.04030488509065712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.014190797592164017, + "learning_rate": 7.304e-06, + "loss": 0.0006, + "num_tokens": 8329298.0, + "reward": 3.0363006591796875, + "reward_std": 0.5665045976638794, + "rewards/reward_fn/mean": 3.0363006591796875, + "rewards/reward_fn/std": 0.5665045976638794, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 128.34375, + "completions/mean_terminated_length": 128.34375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.04042037186742118, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.015855034376727417, + "learning_rate": 7.301999999999999e-06, + "loss": 0.0006, + "num_tokens": 8349949.0, + "reward": 3.9761617183685303, + "reward_std": 0.13484981656074524, + "rewards/reward_fn/mean": 3.9761617183685303, + "rewards/reward_fn/std": 0.13484981656074524, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 316.21875, + "completions/mean_terminated_length": 316.21875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.04053585864418524, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.01628568374144379, + "learning_rate": 7.2999999999999996e-06, + "loss": 0.0007, + "num_tokens": 8370596.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 166.40625, + "completions/mean_terminated_length": 166.40625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.0406513454209493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28125, + "kl": 0.024442071837256663, + "learning_rate": 7.297999999999999e-06, + "loss": 0.001, + "num_tokens": 8397969.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 67.0625, + "completions/mean_terminated_length": 67.0625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.04076683219771336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.01129712884358014, + "learning_rate": 7.296e-06, + "loss": 0.0005, + "num_tokens": 8424979.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 86.65625, + "completions/mean_terminated_length": 86.65625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.04088231897447742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.01109286343853455, + "learning_rate": 7.293999999999999e-06, + "loss": 0.0004, + "num_tokens": 8440680.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 251.65625, + "completions/mean_terminated_length": 251.65625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.040997805751241484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.01423029498255346, + "learning_rate": 7.292e-06, + "loss": 0.0006, + "num_tokens": 8463837.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 172.5, + "completions/mean_terminated_length": 172.5, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.041113292528005545, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.022261171689024195, + "learning_rate": 7.29e-06, + "loss": 0.0009, + "num_tokens": 8492397.0, + "reward": 3.412478446960449, + "reward_std": 0.4141022264957428, + "rewards/reward_fn/mean": 3.412478446960449, + "rewards/reward_fn/std": 0.4141022861003876, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 103.90625, + "completions/mean_terminated_length": 103.90625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.041228779304769606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.013696319336304441, + "learning_rate": 7.2879999999999995e-06, + "loss": 0.0005, + "num_tokens": 8508714.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 128.65625, + "completions/mean_terminated_length": 128.65625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.041344266081533666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.01919619570253417, + "learning_rate": 7.285999999999999e-06, + "loss": 0.0008, + "num_tokens": 8525535.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 127.09375, + "completions/mean_terminated_length": 127.09375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.04145975285829773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.012437685960321687, + "learning_rate": 7.283999999999999e-06, + "loss": 0.0005, + "num_tokens": 8541986.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 104.78125, + "completions/mean_terminated_length": 104.78125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.04157523963506179, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.016581193063757382, + "learning_rate": 7.282e-06, + "loss": 0.0007, + "num_tokens": 8564379.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 154.875, + "completions/mean_terminated_length": 154.875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.04169072641182585, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "kl": 0.021843138558324426, + "learning_rate": 7.28e-06, + "loss": 0.0009, + "num_tokens": 8577335.0, + "reward": 3.6208925247192383, + "reward_std": 0.5330928564071655, + "rewards/reward_fn/mean": 3.6208925247192383, + "rewards/reward_fn/std": 0.5330927968025208, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 338.34375, + "completions/mean_terminated_length": 338.34375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.04180621318858991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.01385333202779293, + "learning_rate": 7.278e-06, + "loss": 0.0006, + "num_tokens": 8597762.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 302.375, + "completions/mean_terminated_length": 302.375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.04192169996535397, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.01431081906775944, + "learning_rate": 7.2759999999999995e-06, + "loss": 0.0006, + "num_tokens": 8623086.0, + "reward": 3.9308419227600098, + "reward_std": 0.39121660590171814, + "rewards/reward_fn/mean": 3.9308419227600098, + "rewards/reward_fn/std": 0.39121660590171814, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 64.6875, + "completions/mean_terminated_length": 64.6875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.04203718674211803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1181640625, + "kl": 0.012174010669696145, + "learning_rate": 7.273999999999999e-06, + "loss": 0.0005, + "num_tokens": 8649412.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 290.75, + "completions/mean_terminated_length": 290.75, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.042152673518882085, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.0140914834337309, + "learning_rate": 7.272e-06, + "loss": 0.0006, + "num_tokens": 8674076.0, + "reward": 3.929488182067871, + "reward_std": 0.3988751769065857, + "rewards/reward_fn/mean": 3.929488182067871, + "rewards/reward_fn/std": 0.3988751471042633, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 62.53125, + "completions/mean_terminated_length": 62.53125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.042268160295646146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2236328125, + "kl": 0.01697751735628117, + "learning_rate": 7.269999999999999e-06, + "loss": 0.0007, + "num_tokens": 8703437.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 81.0625, + "completions/mean_terminated_length": 81.0625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.042383647072410206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.012210331195092294, + "learning_rate": 7.268e-06, + "loss": 0.0005, + "num_tokens": 8713903.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 302.25, + "completions/mean_terminated_length": 245.9354705810547, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.04249913384917427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96875, + "kl": 0.01274511961673852, + "learning_rate": 7.2659999999999996e-06, + "loss": 0.0005, + "num_tokens": 8735895.0, + "reward": 3.6232903003692627, + "reward_std": 0.7644832134246826, + "rewards/reward_fn/mean": 3.6232903003692627, + "rewards/reward_fn/std": 0.7644832134246826, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 343.375, + "completions/mean_terminated_length": 343.375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.04261462062593833, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.010925795038929209, + "learning_rate": 7.264e-06, + "loss": 0.0004, + "num_tokens": 8762371.0, + "reward": 3.5008151531219482, + "reward_std": 0.9585660696029663, + "rewards/reward_fn/mean": 3.5008151531219482, + "rewards/reward_fn/std": 0.9585660696029663, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 95.3125, + "completions/mean_terminated_length": 95.3125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.04273010740270239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.236328125, + "kl": 0.031696687161456794, + "learning_rate": 7.261999999999999e-06, + "loss": 0.0013, + "num_tokens": 8777741.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 240.84375, + "completions/mean_terminated_length": 240.84375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.04284559417946645, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.014319513487862423, + "learning_rate": 7.259999999999999e-06, + "loss": 0.0006, + "num_tokens": 8815624.0, + "reward": 3.87386155128479, + "reward_std": 0.3430371880531311, + "rewards/reward_fn/mean": 3.87386155128479, + "rewards/reward_fn/std": 0.3430371582508087, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 218.5, + "completions/mean_terminated_length": 218.5, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.04296108095623051, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.014259859177400358, + "learning_rate": 7.258e-06, + "loss": 0.0006, + "num_tokens": 8844824.0, + "reward": 3.5248916149139404, + "reward_std": 0.8566362857818604, + "rewards/reward_fn/mean": 3.5248916149139404, + "rewards/reward_fn/std": 0.8566362857818604, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 142.40625, + "completions/mean_terminated_length": 142.40625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.04307656773299457, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.016684046830050647, + "learning_rate": 7.256e-06, + "loss": 0.0007, + "num_tokens": 8871621.0, + "reward": 3.9830873012542725, + "reward_std": 0.0956723615527153, + "rewards/reward_fn/mean": 3.9830873012542725, + "rewards/reward_fn/std": 0.0956723764538765, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 306.78125, + "completions/mean_terminated_length": 306.78125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.04319205450975863, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.019671654794365168, + "learning_rate": 7.2539999999999995e-06, + "loss": 0.0008, + "num_tokens": 8901278.0, + "reward": 3.0079922676086426, + "reward_std": 0.7683718204498291, + "rewards/reward_fn/mean": 3.0079922676086426, + "rewards/reward_fn/std": 0.7683718204498291, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 241.875, + "completions/mean_terminated_length": 241.875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.04330754128652269, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.015615090174833313, + "learning_rate": 7.251999999999999e-06, + "loss": 0.0006, + "num_tokens": 8931706.0, + "reward": 2.7088260650634766, + "reward_std": 0.2754212021827698, + "rewards/reward_fn/mean": 2.7088260650634766, + "rewards/reward_fn/std": 0.275421142578125, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 125.125, + "completions/mean_terminated_length": 125.125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.04342302806328675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.011044395010685548, + "learning_rate": 7.25e-06, + "loss": 0.0004, + "num_tokens": 8947902.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 200.5625, + "completions/mean_terminated_length": 200.5625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.043538514840050814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05908203125, + "kl": 0.011620019518886693, + "learning_rate": 7.248e-06, + "loss": 0.0005, + "num_tokens": 8972688.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 148.5625, + "completions/mean_terminated_length": 148.5625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.043654001616814875, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.017997743692831136, + "learning_rate": 7.246e-06, + "loss": 0.0007, + "num_tokens": 8990018.0, + "reward": 3.929063320159912, + "reward_std": 0.4012782871723175, + "rewards/reward_fn/mean": 3.929063320159912, + "rewards/reward_fn/std": 0.4012782871723175, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 139.90625, + "completions/mean_terminated_length": 139.90625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.043769488393578936, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.024018996773520485, + "learning_rate": 7.244e-06, + "loss": 0.001, + "num_tokens": 9022047.0, + "reward": 3.9418082237243652, + "reward_std": 0.2290171980857849, + "rewards/reward_fn/mean": 3.9418082237243652, + "rewards/reward_fn/std": 0.2290172278881073, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 91.03125, + "completions/mean_terminated_length": 91.03125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.043884975170342996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.013463827803207096, + "learning_rate": 7.2419999999999994e-06, + "loss": 0.0005, + "num_tokens": 9034464.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 180.90625, + "completions/mean_terminated_length": 180.90625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.04400046194710706, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.01626200249302201, + "learning_rate": 7.24e-06, + "loss": 0.0007, + "num_tokens": 9058973.0, + "reward": 3.7902088165283203, + "reward_std": 0.4762765169143677, + "rewards/reward_fn/mean": 3.7902088165283203, + "rewards/reward_fn/std": 0.4762765169143677, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 217.75, + "completions/mean_terminated_length": 158.7096710205078, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.04411594872387112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.458984375, + "kl": 0.016611884617304895, + "learning_rate": 7.238e-06, + "loss": 0.0007, + "num_tokens": 9090613.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 100.96875, + "completions/mean_terminated_length": 100.96875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.04423143550063518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1142578125, + "kl": 0.02043562322796788, + "learning_rate": 7.236e-06, + "loss": 0.0008, + "num_tokens": 9110260.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 167.3125, + "completions/mean_terminated_length": 167.3125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.04434692227739924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.02271213018684648, + "learning_rate": 7.234e-06, + "loss": 0.0009, + "num_tokens": 9134910.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 319.9375, + "completions/mean_terminated_length": 319.9375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.0444624090541633, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.018067770171910524, + "learning_rate": 7.2319999999999995e-06, + "loss": 0.0007, + "num_tokens": 9167964.0, + "reward": 3.2758889198303223, + "reward_std": 0.8786913752555847, + "rewards/reward_fn/mean": 3.2758889198303223, + "rewards/reward_fn/std": 0.8786913752555847, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 200.40625, + "completions/mean_terminated_length": 200.40625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.04457789583092736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057373046875, + "kl": 0.012279920745640993, + "learning_rate": 7.23e-06, + "loss": 0.0005, + "num_tokens": 9189417.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 85.3125, + "completions/mean_terminated_length": 85.3125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.04469338260769142, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.016171434202988166, + "learning_rate": 7.227999999999999e-06, + "loss": 0.0006, + "num_tokens": 9209587.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 72.4375, + "completions/mean_terminated_length": 72.4375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.04480886938445548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.010418821715575177, + "learning_rate": 7.226e-06, + "loss": 0.0004, + "num_tokens": 9233089.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 237.625, + "completions/mean_terminated_length": 237.625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.04492435616121954, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.0159747830912238, + "learning_rate": 7.224e-06, + "loss": 0.0006, + "num_tokens": 9249205.0, + "reward": 3.5321638584136963, + "reward_std": 0.7696741819381714, + "rewards/reward_fn/mean": 3.5321638584136963, + "rewards/reward_fn/std": 0.7696741819381714, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 376.5625, + "completions/mean_terminated_length": 322.6451416015625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.045039842937983604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.013945658065495081, + "learning_rate": 7.2220000000000005e-06, + "loss": 0.0006, + "num_tokens": 9276359.0, + "reward": 3.7317984104156494, + "reward_std": 0.8835108876228333, + "rewards/reward_fn/mean": 3.7317984104156494, + "rewards/reward_fn/std": 0.8835108280181885, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 50.3125, + "completions/mean_terminated_length": 50.3125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.045155329714747665, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.1875, + "kl": 0.013943939433374908, + "learning_rate": 7.2199999999999995e-06, + "loss": 0.0006, + "num_tokens": 9299857.0, + "reward": 3.625, + "reward_std": 1.1845782995224, + "rewards/reward_fn/mean": 3.625, + "rewards/reward_fn/std": 1.1845782995224, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 373.53125, + "completions/mean_terminated_length": 373.53125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.04527081649151172, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.0126413383113686, + "learning_rate": 7.217999999999999e-06, + "loss": 0.0005, + "num_tokens": 9323938.0, + "reward": 3.4893245697021484, + "reward_std": 0.9808948636054993, + "rewards/reward_fn/mean": 3.4893245697021484, + "rewards/reward_fn/std": 0.9808948636054993, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 240.53125, + "completions/mean_terminated_length": 240.53125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.04538630326827578, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.01648494799155742, + "learning_rate": 7.216e-06, + "loss": 0.0007, + "num_tokens": 9347603.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 261.25, + "completions/mean_terminated_length": 261.25, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.04550179004503984, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.013168747333111241, + "learning_rate": 7.214e-06, + "loss": 0.0005, + "num_tokens": 9375963.0, + "reward": 3.7133941650390625, + "reward_std": 0.6878663301467896, + "rewards/reward_fn/mean": 3.7133941650390625, + "rewards/reward_fn/std": 0.6878663301467896, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 255.5, + "completions/mean_terminated_length": 255.5, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.0456172768218039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.017290541960392147, + "learning_rate": 7.212e-06, + "loss": 0.0007, + "num_tokens": 9396043.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 232.84375, + "completions/mean_terminated_length": 232.84375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.04573276359856796, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.013509357740986161, + "learning_rate": 7.21e-06, + "loss": 0.0005, + "num_tokens": 9413094.0, + "reward": 3.9308524131774902, + "reward_std": 0.3911578059196472, + "rewards/reward_fn/mean": 3.9308524131774902, + "rewards/reward_fn/std": 0.39115774631500244, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 95.0, + "completions/mean_terminated_length": 95.0, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.04584825037533202, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.01438189373948262, + "learning_rate": 7.208e-06, + "loss": 0.0006, + "num_tokens": 9439046.0, + "reward": 2.947575569152832, + "reward_std": 0.06682678312063217, + "rewards/reward_fn/mean": 2.947575569152832, + "rewards/reward_fn/std": 0.06682678312063217, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 329.53125, + "completions/mean_terminated_length": 329.53125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.04596373715209608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.015411907559609972, + "learning_rate": 7.206e-06, + "loss": 0.0006, + "num_tokens": 9465751.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 210.78125, + "completions/mean_terminated_length": 210.78125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.046079223928860144, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.015391836655908264, + "learning_rate": 7.203999999999999e-06, + "loss": 0.0006, + "num_tokens": 9489904.0, + "reward": 3.926591157913208, + "reward_std": 0.41526344418525696, + "rewards/reward_fn/mean": 3.926591157913208, + "rewards/reward_fn/std": 0.41526347398757935, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 149.21875, + "completions/mean_terminated_length": 149.21875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.046194710705624205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.013993661283166148, + "learning_rate": 7.202e-06, + "loss": 0.0006, + "num_tokens": 9506167.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 206.03125, + "completions/mean_terminated_length": 206.03125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.046310197482388266, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.023727795894956216, + "learning_rate": 7.2e-06, + "loss": 0.0009, + "num_tokens": 9533720.0, + "reward": 3.0530810356140137, + "reward_std": 0.08688167482614517, + "rewards/reward_fn/mean": 3.0530810356140137, + "rewards/reward_fn/std": 0.08688168227672577, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 227.1875, + "completions/mean_terminated_length": 227.1875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.046425684259152326, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.019487187935737893, + "learning_rate": 7.198e-06, + "loss": 0.0008, + "num_tokens": 9553438.0, + "reward": 3.300380229949951, + "reward_std": 1.0545986890792847, + "rewards/reward_fn/mean": 3.300380229949951, + "rewards/reward_fn/std": 1.0545986890792847, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 100.0, + "completions/max_terminated_length": 100.0, + "completions/mean_length": 67.0, + "completions/mean_terminated_length": 67.0, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.04654117103591639, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.01260414783610031, + "learning_rate": 7.195999999999999e-06, + "loss": 0.0005, + "num_tokens": 9567934.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 151.96875, + "completions/mean_terminated_length": 151.96875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.04665665781268045, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.01862855048966594, + "learning_rate": 7.193999999999999e-06, + "loss": 0.0007, + "num_tokens": 9591357.0, + "reward": 3.4720869064331055, + "reward_std": 0.4748080372810364, + "rewards/reward_fn/mean": 3.4720869064331055, + "rewards/reward_fn/std": 0.4748080372810364, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 192.59375, + "completions/mean_terminated_length": 192.59375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.04677214458944451, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.024708818789804354, + "learning_rate": 7.192e-06, + "loss": 0.001, + "num_tokens": 9609840.0, + "reward": 3.744049072265625, + "reward_std": 0.6203041672706604, + "rewards/reward_fn/mean": 3.744049072265625, + "rewards/reward_fn/std": 0.6203042268753052, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 195.75, + "completions/mean_terminated_length": 195.75, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.04688763136620857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.02118657683604397, + "learning_rate": 7.19e-06, + "loss": 0.0008, + "num_tokens": 9628744.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 130.1875, + "completions/mean_terminated_length": 130.1875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.04700311814297263, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "kl": 0.014864540207781829, + "learning_rate": 7.188e-06, + "loss": 0.0006, + "num_tokens": 9646286.0, + "reward": 3.9119713306427, + "reward_std": 0.238542839884758, + "rewards/reward_fn/mean": 3.9119713306427, + "rewards/reward_fn/std": 0.2385428249835968, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 99.84375, + "completions/mean_terminated_length": 99.84375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.04711860491973669, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.020700045803096145, + "learning_rate": 7.1859999999999995e-06, + "loss": 0.0008, + "num_tokens": 9667497.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 172.28125, + "completions/mean_terminated_length": 172.28125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.04723409169650075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.016727116642869078, + "learning_rate": 7.184e-06, + "loss": 0.0007, + "num_tokens": 9686738.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 128.15625, + "completions/mean_terminated_length": 128.15625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.04734957847326481, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.01630168427072931, + "learning_rate": 7.182e-06, + "loss": 0.0007, + "num_tokens": 9708983.0, + "reward": 3.97406005859375, + "reward_std": 0.1467391550540924, + "rewards/reward_fn/mean": 3.97406005859375, + "rewards/reward_fn/std": 0.1467391848564148, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 167.03125, + "completions/mean_terminated_length": 167.03125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.04746506525002887, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.01596589488326572, + "learning_rate": 7.179999999999999e-06, + "loss": 0.0006, + "num_tokens": 9728696.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 70.4375, + "completions/mean_terminated_length": 70.4375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.047580552026792934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.008525786011887249, + "learning_rate": 7.178e-06, + "loss": 0.0003, + "num_tokens": 9746630.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 141.28125, + "completions/mean_terminated_length": 141.28125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.047696038803556995, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.012704452310572378, + "learning_rate": 7.1759999999999996e-06, + "loss": 0.0005, + "num_tokens": 9775311.0, + "reward": 3.010441303253174, + "reward_std": 0.29302406311035156, + "rewards/reward_fn/mean": 3.010441303253174, + "rewards/reward_fn/std": 0.29302406311035156, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 197.75, + "completions/mean_terminated_length": 197.75, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.047811525580321056, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.013776170613709837, + "learning_rate": 7.174e-06, + "loss": 0.0006, + "num_tokens": 9794119.0, + "reward": 3.0242695808410645, + "reward_std": 0.524551510810852, + "rewards/reward_fn/mean": 3.0242695808410645, + "rewards/reward_fn/std": 0.524551510810852, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 281.21875, + "completions/mean_terminated_length": 281.21875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.047927012357085116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057373046875, + "kl": 0.011081444550654851, + "learning_rate": 7.171999999999999e-06, + "loss": 0.0004, + "num_tokens": 9818350.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 314.21875, + "completions/mean_terminated_length": 314.21875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.04804249913384918, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.012679003077209927, + "learning_rate": 7.17e-06, + "loss": 0.0005, + "num_tokens": 9853141.0, + "reward": 3.583848476409912, + "reward_std": 0.8344935774803162, + "rewards/reward_fn/mean": 3.583848476409912, + "rewards/reward_fn/std": 0.8344935774803162, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 86.0, + "completions/max_terminated_length": 86.0, + "completions/mean_length": 57.40625, + "completions/mean_terminated_length": 57.40625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.04815798591061324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.01280318171120598, + "learning_rate": 7.168e-06, + "loss": 0.0005, + "num_tokens": 9876354.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 202.65625, + "completions/mean_terminated_length": 202.65625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.0482734726873773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.024114042258588597, + "learning_rate": 7.166e-06, + "loss": 0.001, + "num_tokens": 9903671.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 90.875, + "completions/mean_terminated_length": 90.875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.04838895946414135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.01124746475397842, + "learning_rate": 7.1639999999999995e-06, + "loss": 0.0004, + "num_tokens": 9914419.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 366.34375, + "completions/mean_terminated_length": 366.34375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.04850444624090541, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048583984375, + "kl": 0.012494038717704825, + "learning_rate": 7.161999999999999e-06, + "loss": 0.0005, + "num_tokens": 9935838.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 159.90625, + "completions/mean_terminated_length": 159.90625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.048619933017669474, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.014734933763975278, + "learning_rate": 7.16e-06, + "loss": 0.0006, + "num_tokens": 9963547.0, + "reward": 3.7241764068603516, + "reward_std": 0.36208513379096985, + "rewards/reward_fn/mean": 3.7241764068603516, + "rewards/reward_fn/std": 0.36208510398864746, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 291.9375, + "completions/mean_terminated_length": 235.29031372070312, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.048735419794433535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9609375, + "kl": 0.014524794190947432, + "learning_rate": 7.158e-06, + "loss": 0.0006, + "num_tokens": 9990905.0, + "reward": 3.722508430480957, + "reward_std": 0.8231249451637268, + "rewards/reward_fn/mean": 3.722508430480957, + "rewards/reward_fn/std": 0.8231249451637268, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 213.625, + "completions/mean_terminated_length": 213.625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.048850906571197596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.016897749970667064, + "learning_rate": 7.156e-06, + "loss": 0.0007, + "num_tokens": 10011885.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 325.25, + "completions/mean_terminated_length": 325.25, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.048966393347961656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.018373530940152705, + "learning_rate": 7.154e-06, + "loss": 0.0007, + "num_tokens": 10042133.0, + "reward": 3.5505101680755615, + "reward_std": 0.725200355052948, + "rewards/reward_fn/mean": 3.5505101680755615, + "rewards/reward_fn/std": 0.7252002954483032, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 109.53125, + "completions/mean_terminated_length": 109.53125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.04908188012472572, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.022904285026015714, + "learning_rate": 7.1519999999999995e-06, + "loss": 0.0009, + "num_tokens": 10063558.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 286.6875, + "completions/mean_terminated_length": 286.6875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.04919736690148978, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.013823103596223518, + "learning_rate": 7.15e-06, + "loss": 0.0006, + "num_tokens": 10094364.0, + "reward": 2.703886032104492, + "reward_std": 0.588105320930481, + "rewards/reward_fn/mean": 2.703886032104492, + "rewards/reward_fn/std": 0.588105320930481, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 154.28125, + "completions/mean_terminated_length": 154.28125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.04931285367825384, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.02078727982006967, + "learning_rate": 7.147999999999999e-06, + "loss": 0.0008, + "num_tokens": 10111269.0, + "reward": 3.902130603790283, + "reward_std": 0.41872137784957886, + "rewards/reward_fn/mean": 3.902130603790283, + "rewards/reward_fn/std": 0.41872137784957886, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 78.40625, + "completions/mean_terminated_length": 78.40625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.0494283404550179, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "kl": 0.016537673509446904, + "learning_rate": 7.146e-06, + "loss": 0.0007, + "num_tokens": 10129426.0, + "reward": 3.9645540714263916, + "reward_std": 0.20051251351833344, + "rewards/reward_fn/mean": 3.9645540714263916, + "rewards/reward_fn/std": 0.20051254332065582, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 99.03125, + "completions/mean_terminated_length": 99.03125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.04954382723178196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.013878548648790456, + "learning_rate": 7.144e-06, + "loss": 0.0006, + "num_tokens": 10144947.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 304.5, + "completions/mean_terminated_length": 304.5, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.04965931400854602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.014581637500668876, + "learning_rate": 7.142e-06, + "loss": 0.0006, + "num_tokens": 10164963.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 228.28125, + "completions/mean_terminated_length": 228.28125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.04977480078531008, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.017348826077068225, + "learning_rate": 7.139999999999999e-06, + "loss": 0.0007, + "num_tokens": 10194476.0, + "reward": 2.93521785736084, + "reward_std": 0.5329893827438354, + "rewards/reward_fn/mean": 2.93521785736084, + "rewards/reward_fn/std": 0.5329893827438354, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 97.0625, + "completions/mean_terminated_length": 97.0625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.04989028756207414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.008961345651187003, + "learning_rate": 7.137999999999999e-06, + "loss": 0.0004, + "num_tokens": 10207982.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 191.96875, + "completions/mean_terminated_length": 191.96875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.0500057743388382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04296875, + "kl": 0.006546734810399357, + "learning_rate": 7.136e-06, + "loss": 0.0003, + "num_tokens": 10238669.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 132.25, + "completions/mean_terminated_length": 132.25, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.050121261115602264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.018257138566696085, + "learning_rate": 7.134e-06, + "loss": 0.0007, + "num_tokens": 10254197.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 56.8125, + "completions/mean_terminated_length": 56.8125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.050236747892366325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1142578125, + "kl": 0.013563794156652875, + "learning_rate": 7.132e-06, + "loss": 0.0005, + "num_tokens": 10266575.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 105.03125, + "completions/mean_terminated_length": 105.03125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.050352234669130386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.01445709215477109, + "learning_rate": 7.1299999999999995e-06, + "loss": 0.0006, + "num_tokens": 10284016.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 85.34375, + "completions/mean_terminated_length": 85.34375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.050467721445894446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.007474855359760113, + "learning_rate": 7.128e-06, + "loss": 0.0003, + "num_tokens": 10303067.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 158.90625, + "completions/mean_terminated_length": 158.90625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.05058320822265851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.01588945671392139, + "learning_rate": 7.126e-06, + "loss": 0.0006, + "num_tokens": 10325784.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 94.0, + "completions/max_terminated_length": 94.0, + "completions/mean_length": 60.875, + "completions/mean_terminated_length": 60.875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.05069869499942257, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.177734375, + "kl": 0.015210721947369166, + "learning_rate": 7.123999999999999e-06, + "loss": 0.0006, + "num_tokens": 10344468.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 986.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 375.84375, + "completions/mean_terminated_length": 375.84375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.05081418177618663, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.012402691732859239, + "learning_rate": 7.122e-06, + "loss": 0.0005, + "num_tokens": 10378671.0, + "reward": 2.903026819229126, + "reward_std": 0.48146092891693115, + "rewards/reward_fn/mean": 2.903026819229126, + "rewards/reward_fn/std": 0.48146089911460876, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 245.3125, + "completions/mean_terminated_length": 245.3125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.05092966855295069, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049560546875, + "kl": 0.010914407175732777, + "learning_rate": 7.12e-06, + "loss": 0.0004, + "num_tokens": 10402009.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 253.125, + "completions/mean_terminated_length": 253.125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.05104515532971475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.01387399218219798, + "learning_rate": 7.118e-06, + "loss": 0.0006, + "num_tokens": 10424893.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1192.0, + "completions/max_terminated_length": 1192.0, + "completions/mean_length": 275.25, + "completions/mean_terminated_length": 275.25, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.05116064210647881, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.017766479082638398, + "learning_rate": 7.115999999999999e-06, + "loss": 0.0007, + "num_tokens": 10446469.0, + "reward": 3.9311487674713135, + "reward_std": 0.38948163390159607, + "rewards/reward_fn/mean": 3.9311487674713135, + "rewards/reward_fn/std": 0.38948163390159607, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 91.5, + "completions/mean_terminated_length": 91.5, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.05127612888324287, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.859375, + "kl": 0.014660945977084339, + "learning_rate": 7.113999999999999e-06, + "loss": 0.0006, + "num_tokens": 10471125.0, + "reward": 3.8030753135681152, + "reward_std": 0.41666778922080994, + "rewards/reward_fn/mean": 3.8030753135681152, + "rewards/reward_fn/std": 0.41666775941848755, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 125.8125, + "completions/mean_terminated_length": 125.8125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.05139161566000693, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.023040831452817656, + "learning_rate": 7.112e-06, + "loss": 0.0009, + "num_tokens": 10502703.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 175.25, + "completions/mean_terminated_length": 175.25, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.051507102436770986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.013633181035402231, + "learning_rate": 7.11e-06, + "loss": 0.0005, + "num_tokens": 10530199.0, + "reward": 3.0563771724700928, + "reward_std": 0.55732661485672, + "rewards/reward_fn/mean": 3.0563771724700928, + "rewards/reward_fn/std": 0.55732661485672, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 142.40625, + "completions/mean_terminated_length": 142.40625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.05162258921353505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.016291582302073948, + "learning_rate": 7.1079999999999995e-06, + "loss": 0.0007, + "num_tokens": 10554788.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 74.5, + "completions/mean_terminated_length": 74.5, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.05173807599029911, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.154296875, + "kl": 0.014934393744624685, + "learning_rate": 7.105999999999999e-06, + "loss": 0.0006, + "num_tokens": 10576788.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 277.1875, + "completions/mean_terminated_length": 277.1875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.05185356276706317, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.013695397195988335, + "learning_rate": 7.104e-06, + "loss": 0.0005, + "num_tokens": 10604026.0, + "reward": 3.8060779571533203, + "reward_std": 0.512506902217865, + "rewards/reward_fn/mean": 3.8060779571533203, + "rewards/reward_fn/std": 0.5125069618225098, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 172.90625, + "completions/mean_terminated_length": 172.90625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.05196904954382723, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.017695833841571584, + "learning_rate": 7.102e-06, + "loss": 0.0007, + "num_tokens": 10621367.0, + "reward": 3.4445438385009766, + "reward_std": 0.9777275919914246, + "rewards/reward_fn/mean": 3.4445438385009766, + "rewards/reward_fn/std": 0.977727472782135, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 149.59375, + "completions/mean_terminated_length": 149.59375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.05208453632059129, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.01631888948031701, + "learning_rate": 7.099999999999999e-06, + "loss": 0.0007, + "num_tokens": 10641194.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 66.90625, + "completions/mean_terminated_length": 66.90625, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.05220002309735535, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.158203125, + "kl": 0.01918772755016107, + "learning_rate": 7.098e-06, + "loss": 0.0008, + "num_tokens": 10661575.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 347.8125, + "completions/mean_terminated_length": 347.8125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.05231550987411941, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.010221972945146263, + "learning_rate": 7.0959999999999995e-06, + "loss": 0.0004, + "num_tokens": 10695233.0, + "reward": 3.2999963760375977, + "reward_std": 0.5005695223808289, + "rewards/reward_fn/mean": 3.2999963760375977, + "rewards/reward_fn/std": 0.5005695223808289, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 83.25, + "completions/mean_terminated_length": 83.25, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.05243099665088347, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.703125, + "kl": 0.011682098200253677, + "learning_rate": 7.094e-06, + "loss": 0.0005, + "num_tokens": 10710921.0, + "reward": 3.8441238403320312, + "reward_std": 0.7228685021400452, + "rewards/reward_fn/mean": 3.8441238403320312, + "rewards/reward_fn/std": 0.7228685021400452, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 249.5625, + "completions/mean_terminated_length": 249.5625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.05254648342764753, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.010656792015652172, + "learning_rate": 7.091999999999999e-06, + "loss": 0.0004, + "num_tokens": 10735003.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1217.0, + "completions/max_terminated_length": 1217.0, + "completions/mean_length": 291.3125, + "completions/mean_terminated_length": 291.3125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.052661970204411594, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.010581067865132354, + "learning_rate": 7.09e-06, + "loss": 0.0004, + "num_tokens": 10761829.0, + "reward": 3.727781295776367, + "reward_std": 0.5499487519264221, + "rewards/reward_fn/mean": 3.727781295776367, + "rewards/reward_fn/std": 0.5499487519264221, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 139.96875, + "completions/mean_terminated_length": 139.96875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.052777456981175655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.017095128379878588, + "learning_rate": 7.088e-06, + "loss": 0.0007, + "num_tokens": 10779940.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 313.40625, + "completions/mean_terminated_length": 313.40625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.052892943757939716, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.012503948295488954, + "learning_rate": 7.086e-06, + "loss": 0.0005, + "num_tokens": 10817585.0, + "reward": 3.786060333251953, + "reward_std": 0.6758123636245728, + "rewards/reward_fn/mean": 3.786060333251953, + "rewards/reward_fn/std": 0.675812304019928, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 77.5, + "completions/mean_terminated_length": 77.5, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.053008430534703777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.00782165825512493, + "learning_rate": 7.0839999999999994e-06, + "loss": 0.0003, + "num_tokens": 10832385.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 56.71875, + "completions/mean_terminated_length": 56.71875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.05312391731146784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2392578125, + "kl": 0.02003132719619316, + "learning_rate": 7.081999999999999e-06, + "loss": 0.0008, + "num_tokens": 10847544.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 177.40625, + "completions/mean_terminated_length": 177.40625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.0532394040882319, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.014051743506570347, + "learning_rate": 7.08e-06, + "loss": 0.0006, + "num_tokens": 10869925.0, + "reward": 3.9359707832336426, + "reward_std": 0.25851911306381226, + "rewards/reward_fn/mean": 3.9359707832336426, + "rewards/reward_fn/std": 0.25851908326148987, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 263.9375, + "completions/mean_terminated_length": 263.9375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.05335489086499596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05859375, + "kl": 0.013913654038333334, + "learning_rate": 7.078e-06, + "loss": 0.0006, + "num_tokens": 10893187.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 268.46875, + "completions/mean_terminated_length": 268.46875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.05347037764176002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.025174270645948127, + "learning_rate": 7.076e-06, + "loss": 0.001, + "num_tokens": 10914450.0, + "reward": 3.7911086082458496, + "reward_std": 0.6599133014678955, + "rewards/reward_fn/mean": 3.7911086082458496, + "rewards/reward_fn/std": 0.6599133014678955, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 337.84375, + "completions/mean_terminated_length": 337.84375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.05358586441852408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.01228907953191083, + "learning_rate": 7.0739999999999995e-06, + "loss": 0.0005, + "num_tokens": 10941357.0, + "reward": 3.927277088165283, + "reward_std": 0.4113825559616089, + "rewards/reward_fn/mean": 3.927277088165283, + "rewards/reward_fn/std": 0.4113825261592865, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 246.4375, + "completions/mean_terminated_length": 246.4375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.05370135119528814, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.022118185355793685, + "learning_rate": 7.071999999999999e-06, + "loss": 0.0009, + "num_tokens": 10963195.0, + "reward": 3.914492607116699, + "reward_std": 0.2717793583869934, + "rewards/reward_fn/mean": 3.914492607116699, + "rewards/reward_fn/std": 0.2717793583869934, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 115.21875, + "completions/mean_terminated_length": 115.21875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.0538168379720522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.019113596063107252, + "learning_rate": 7.07e-06, + "loss": 0.0008, + "num_tokens": 10983458.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 101.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 78.34375, + "completions/mean_terminated_length": 78.34375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.05393232474881626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.012007688483208767, + "learning_rate": 7.067999999999999e-06, + "loss": 0.0005, + "num_tokens": 11003085.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 251.4375, + "completions/mean_terminated_length": 251.4375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.054047811525580323, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.01856064674211666, + "learning_rate": 7.066e-06, + "loss": 0.0007, + "num_tokens": 11034523.0, + "reward": 3.617964267730713, + "reward_std": 0.6208584904670715, + "rewards/reward_fn/mean": 3.617964267730713, + "rewards/reward_fn/std": 0.6208584904670715, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 76.0, + "completions/mean_terminated_length": 76.0, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.054163298302344384, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1611328125, + "kl": 0.016953302540059667, + "learning_rate": 7.064e-06, + "loss": 0.0007, + "num_tokens": 11046459.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 146.8125, + "completions/mean_terminated_length": 146.8125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.054278785079108445, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.703125, + "kl": 0.01595947904570494, + "learning_rate": 7.062e-06, + "loss": 0.0006, + "num_tokens": 11063605.0, + "reward": 3.928513526916504, + "reward_std": 0.40438786149024963, + "rewards/reward_fn/mean": 3.928513526916504, + "rewards/reward_fn/std": 0.40438786149024963, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 233.65625, + "completions/mean_terminated_length": 233.65625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.054394271855872506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.012540741576231085, + "learning_rate": 7.059999999999999e-06, + "loss": 0.0005, + "num_tokens": 11081034.0, + "reward": 3.8928191661834717, + "reward_std": 0.448201984167099, + "rewards/reward_fn/mean": 3.8928191661834717, + "rewards/reward_fn/std": 0.448201984167099, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 332.125, + "completions/mean_terminated_length": 332.125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.05450975863263657, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.014220582117559388, + "learning_rate": 7.057999999999999e-06, + "loss": 0.0006, + "num_tokens": 11104622.0, + "reward": 3.788461446762085, + "reward_std": 0.6682384610176086, + "rewards/reward_fn/mean": 3.788461446762085, + "rewards/reward_fn/std": 0.6682384014129639, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 195.21875, + "completions/mean_terminated_length": 195.21875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.05462524540940062, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.016305267170537263, + "learning_rate": 7.056e-06, + "loss": 0.0007, + "num_tokens": 11128565.0, + "reward": 3.867530345916748, + "reward_std": 0.4607440233230591, + "rewards/reward_fn/mean": 3.867530345916748, + "rewards/reward_fn/std": 0.4607439935207367, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 91.09375, + "completions/mean_terminated_length": 91.09375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.05474073218616468, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.012345580867986428, + "learning_rate": 7.054e-06, + "loss": 0.0005, + "num_tokens": 11145400.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 216.59375, + "completions/mean_terminated_length": 216.59375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.05485621896292874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046142578125, + "kl": 0.01063501437602099, + "learning_rate": 7.0519999999999996e-06, + "loss": 0.0004, + "num_tokens": 11167979.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 393.3125, + "completions/mean_terminated_length": 393.3125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.0549717057396928, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.012673015196924098, + "learning_rate": 7.049999999999999e-06, + "loss": 0.0005, + "num_tokens": 11203733.0, + "reward": 3.8939366340637207, + "reward_std": 0.450479120016098, + "rewards/reward_fn/mean": 3.8939366340637207, + "rewards/reward_fn/std": 0.4504791498184204, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 142.75, + "completions/mean_terminated_length": 142.75, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.055087192516456863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.010973096315865405, + "learning_rate": 7.048e-06, + "loss": 0.0004, + "num_tokens": 11221997.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 147.4375, + "completions/mean_terminated_length": 147.4375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.055202679293220924, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.009136733029663446, + "learning_rate": 7.046e-06, + "loss": 0.0004, + "num_tokens": 11251387.0, + "reward": 3.791884183883667, + "reward_std": 0.4980187714099884, + "rewards/reward_fn/mean": 3.791884183883667, + "rewards/reward_fn/std": 0.4980187714099884, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 137.53125, + "completions/mean_terminated_length": 137.53125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.055318166069984985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.016142700202181004, + "learning_rate": 7.043999999999999e-06, + "loss": 0.0006, + "num_tokens": 11267948.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 213.78125, + "completions/mean_terminated_length": 213.78125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.055433652846749046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.01715540025907103, + "learning_rate": 7.042e-06, + "loss": 0.0007, + "num_tokens": 11286821.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 144.53125, + "completions/mean_terminated_length": 144.53125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.05554913962351311, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "kl": 0.01656286374782212, + "learning_rate": 7.0399999999999995e-06, + "loss": 0.0007, + "num_tokens": 11309718.0, + "reward": 3.2857699394226074, + "reward_std": 0.07743456214666367, + "rewards/reward_fn/mean": 3.2857699394226074, + "rewards/reward_fn/std": 0.07743457704782486, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 985.0, + "completions/max_terminated_length": 985.0, + "completions/mean_length": 452.25, + "completions/mean_terminated_length": 452.25, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.05566462640027717, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.012987825437448919, + "learning_rate": 7.038e-06, + "loss": 0.0005, + "num_tokens": 11343902.0, + "reward": 2.8080155849456787, + "reward_std": 0.49147385358810425, + "rewards/reward_fn/mean": 2.8080155849456787, + "rewards/reward_fn/std": 0.49147382378578186, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 180.9375, + "completions/mean_terminated_length": 180.9375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.05578011317704123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.01797349698608741, + "learning_rate": 7.035999999999999e-06, + "loss": 0.0007, + "num_tokens": 11371292.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 321.34375, + "completions/mean_terminated_length": 321.34375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.05589559995380529, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.012888979341369122, + "learning_rate": 7.034e-06, + "loss": 0.0005, + "num_tokens": 11393191.0, + "reward": 3.926039218902588, + "reward_std": 0.4183851182460785, + "rewards/reward_fn/mean": 3.926039218902588, + "rewards/reward_fn/std": 0.4183851182460785, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 220.4375, + "completions/mean_terminated_length": 220.4375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.05601108673056935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.013198564323829487, + "learning_rate": 7.032e-06, + "loss": 0.0005, + "num_tokens": 11412149.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 210.84375, + "completions/mean_terminated_length": 210.84375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.05612657350733341, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.01350961952994112, + "learning_rate": 7.03e-06, + "loss": 0.0005, + "num_tokens": 11441360.0, + "reward": 3.9680733680725098, + "reward_std": 0.18060512840747833, + "rewards/reward_fn/mean": 3.9680733680725098, + "rewards/reward_fn/std": 0.18060512840747833, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 268.125, + "completions/mean_terminated_length": 268.125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.05624206028409747, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.013799087551888078, + "learning_rate": 7.0279999999999995e-06, + "loss": 0.0006, + "num_tokens": 11465588.0, + "reward": 3.9308760166168213, + "reward_std": 0.39102426171302795, + "rewards/reward_fn/mean": 3.9308760166168213, + "rewards/reward_fn/std": 0.39102426171302795, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 259.75, + "completions/mean_terminated_length": 259.75, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.05635754706086153, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.016873140557436273, + "learning_rate": 7.025999999999999e-06, + "loss": 0.0007, + "num_tokens": 11499564.0, + "reward": 3.8745360374450684, + "reward_std": 0.2999594807624817, + "rewards/reward_fn/mean": 3.8745360374450684, + "rewards/reward_fn/std": 0.2999595105648041, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 276.625, + "completions/mean_terminated_length": 276.625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.05647303383762559, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.016035863023716956, + "learning_rate": 7.024e-06, + "loss": 0.0006, + "num_tokens": 11527008.0, + "reward": 3.5638961791992188, + "reward_std": 0.8226156830787659, + "rewards/reward_fn/mean": 3.5638961791992188, + "rewards/reward_fn/std": 0.8226156830787659, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 105.0625, + "completions/mean_terminated_length": 105.0625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.056588520614389654, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.017001760497805662, + "learning_rate": 7.022e-06, + "loss": 0.0007, + "num_tokens": 11542626.0, + "reward": 3.940070629119873, + "reward_std": 0.2358221560716629, + "rewards/reward_fn/mean": 3.940070629119873, + "rewards/reward_fn/std": 0.2358221411705017, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 108.21875, + "completions/mean_terminated_length": 108.21875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.056704007391153714, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.014117752398306038, + "learning_rate": 7.019999999999999e-06, + "loss": 0.0006, + "num_tokens": 11568297.0, + "reward": 3.855457305908203, + "reward_std": 0.3419138193130493, + "rewards/reward_fn/mean": 3.855457305908203, + "rewards/reward_fn/std": 0.3419138491153717, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 287.875, + "completions/mean_terminated_length": 287.875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.056819494167917775, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.012607541488250718, + "learning_rate": 7.0179999999999996e-06, + "loss": 0.0005, + "num_tokens": 11589861.0, + "reward": 3.926321029663086, + "reward_std": 0.41679081320762634, + "rewards/reward_fn/mean": 3.926321029663086, + "rewards/reward_fn/std": 0.41679075360298157, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 179.71875, + "completions/mean_terminated_length": 179.71875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.056934980944681836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.014474307958153076, + "learning_rate": 7.015999999999999e-06, + "loss": 0.0006, + "num_tokens": 11608348.0, + "reward": 3.3014888763427734, + "reward_std": 0.312743604183197, + "rewards/reward_fn/mean": 3.3014888763427734, + "rewards/reward_fn/std": 0.31274357438087463, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 280.21875, + "completions/mean_terminated_length": 280.21875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.0570504677214459, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.014082789362873882, + "learning_rate": 7.014e-06, + "loss": 0.0006, + "num_tokens": 11636515.0, + "reward": 3.6107289791107178, + "reward_std": 0.4206605553627014, + "rewards/reward_fn/mean": 3.6107289791107178, + "rewards/reward_fn/std": 0.4206605851650238, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 164.03125, + "completions/mean_terminated_length": 164.03125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.05716595449820996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203125, + "kl": 0.0138043640909018, + "learning_rate": 7.011999999999999e-06, + "loss": 0.0006, + "num_tokens": 11662148.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 130.9375, + "completions/mean_terminated_length": 130.9375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.05728144127497402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.007488663366530091, + "learning_rate": 7.01e-06, + "loss": 0.0003, + "num_tokens": 11680514.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 102.03125, + "completions/mean_terminated_length": 102.03125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.05739692805173808, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12158203125, + "kl": 0.02030332367576193, + "learning_rate": 7.008e-06, + "loss": 0.0008, + "num_tokens": 11694691.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 124.90625, + "completions/mean_terminated_length": 124.90625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.05751241482850214, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.032924249331699684, + "learning_rate": 7.0059999999999995e-06, + "loss": 0.0013, + "num_tokens": 11709536.0, + "reward": 3.8988699913024902, + "reward_std": 0.4222148060798645, + "rewards/reward_fn/mean": 3.8988699913024902, + "rewards/reward_fn/std": 0.4222147762775421, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 211.71875, + "completions/mean_terminated_length": 211.71875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.0576279016052662, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.027832976571517065, + "learning_rate": 7.003999999999999e-06, + "loss": 0.0011, + "num_tokens": 11740023.0, + "reward": 3.1460533142089844, + "reward_std": 0.5437058806419373, + "rewards/reward_fn/mean": 3.1460533142089844, + "rewards/reward_fn/std": 0.5437058210372925, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 57.0625, + "completions/mean_terminated_length": 57.0625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.057743388382030254, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.25, + "kl": 0.017788823875889648, + "learning_rate": 7.001999999999999e-06, + "loss": 0.0007, + "num_tokens": 11755353.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 84.5, + "completions/mean_terminated_length": 84.5, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.057858875158794315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.169921875, + "kl": 0.019832735226373188, + "learning_rate": 7e-06, + "loss": 0.0008, + "num_tokens": 11778409.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 109.875, + "completions/mean_terminated_length": 109.875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.057974361935558376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.007558900488220388, + "learning_rate": 6.998e-06, + "loss": 0.0003, + "num_tokens": 11798341.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 134.96875, + "completions/mean_terminated_length": 134.96875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.05808984871232244, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.015705443045590073, + "learning_rate": 6.9960000000000004e-06, + "loss": 0.0006, + "num_tokens": 11810436.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 210.71875, + "completions/mean_terminated_length": 210.71875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.0582053354890865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.01511598113575019, + "learning_rate": 6.9939999999999994e-06, + "loss": 0.0006, + "num_tokens": 11829947.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 275.25, + "completions/mean_terminated_length": 275.25, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.05832082226585056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.021158855292014778, + "learning_rate": 6.991999999999999e-06, + "loss": 0.0008, + "num_tokens": 11858691.0, + "reward": 3.626426935195923, + "reward_std": 0.6399580836296082, + "rewards/reward_fn/mean": 3.626426935195923, + "rewards/reward_fn/std": 0.6399580240249634, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 175.34375, + "completions/mean_terminated_length": 175.34375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.05843630904261462, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.02040002660942264, + "learning_rate": 6.99e-06, + "loss": 0.0008, + "num_tokens": 11886158.0, + "reward": 3.9697389602661133, + "reward_std": 0.17118309438228607, + "rewards/reward_fn/mean": 3.9697389602661133, + "rewards/reward_fn/std": 0.17118309438228607, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1560.0, + "completions/mean_length": 619.0, + "completions/mean_terminated_length": 523.7333374023438, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.05855179581937868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.013686523932847194, + "learning_rate": 6.988e-06, + "loss": 0.0005, + "num_tokens": 11925294.0, + "reward": 2.88761568069458, + "reward_std": 1.0984435081481934, + "rewards/reward_fn/mean": 2.88761568069458, + "rewards/reward_fn/std": 1.0984435081481934, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 220.75, + "completions/mean_terminated_length": 161.8064422607422, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.05866728259614274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80078125, + "kl": 0.016947900759987533, + "learning_rate": 6.986e-06, + "loss": 0.0007, + "num_tokens": 11944774.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 261.03125, + "completions/mean_terminated_length": 261.03125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.0587827693729068, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.013468218763591722, + "learning_rate": 6.9839999999999995e-06, + "loss": 0.0005, + "num_tokens": 11966375.0, + "reward": 3.169464588165283, + "reward_std": 0.9693487286567688, + "rewards/reward_fn/mean": 3.169464588165283, + "rewards/reward_fn/std": 0.969348669052124, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 118.375, + "completions/mean_terminated_length": 118.375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.05889825614967086, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.012632753976504318, + "learning_rate": 6.982e-06, + "loss": 0.0005, + "num_tokens": 11995315.0, + "reward": 3.8385791778564453, + "reward_std": 0.3418509364128113, + "rewards/reward_fn/mean": 3.8385791778564453, + "rewards/reward_fn/std": 0.3418509364128113, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 197.71875, + "completions/mean_terminated_length": 197.71875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.05901374292643492, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.01675421465188265, + "learning_rate": 6.98e-06, + "loss": 0.0007, + "num_tokens": 12023978.0, + "reward": 3.7866616249084473, + "reward_std": 0.6215641498565674, + "rewards/reward_fn/mean": 3.7866616249084473, + "rewards/reward_fn/std": 0.6215641498565674, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 85.09375, + "completions/mean_terminated_length": 85.09375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.059129229703198984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1865234375, + "kl": 0.02149993927741889, + "learning_rate": 6.977999999999999e-06, + "loss": 0.0009, + "num_tokens": 12053517.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 132.78125, + "completions/mean_terminated_length": 132.78125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.059244716479963044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11572265625, + "kl": 0.01955039775930345, + "learning_rate": 6.976e-06, + "loss": 0.0008, + "num_tokens": 12067366.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 170.78125, + "completions/mean_terminated_length": 170.78125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.059360203256727105, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1708984375, + "kl": 0.017008961920510046, + "learning_rate": 6.974e-06, + "loss": 0.0007, + "num_tokens": 12083743.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 237.1875, + "completions/mean_terminated_length": 237.1875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.059475690033491166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.016588899146881886, + "learning_rate": 6.972e-06, + "loss": 0.0007, + "num_tokens": 12103653.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 227.8125, + "completions/mean_terminated_length": 227.8125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.05959117681025523, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.02051679097348824, + "learning_rate": 6.969999999999999e-06, + "loss": 0.0008, + "num_tokens": 12135455.0, + "reward": 3.8209950923919678, + "reward_std": 0.34412986040115356, + "rewards/reward_fn/mean": 3.8209950923919678, + "rewards/reward_fn/std": 0.3441298305988312, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 59.65625, + "completions/mean_terminated_length": 59.65625, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.05970666358701929, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "kl": 0.01749833051871974, + "learning_rate": 6.968e-06, + "loss": 0.0007, + "num_tokens": 12148468.0, + "reward": 3.9757447242736816, + "reward_std": 0.13720804452896118, + "rewards/reward_fn/mean": 3.9757447242736816, + "rewards/reward_fn/std": 0.13720804452896118, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 70.375, + "completions/mean_terminated_length": 70.375, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.05982215036378335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.010547147270699497, + "learning_rate": 6.966e-06, + "loss": 0.0004, + "num_tokens": 12176224.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 228.9375, + "completions/mean_terminated_length": 228.9375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.05993763714054741, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.010616500658215955, + "learning_rate": 6.964e-06, + "loss": 0.0004, + "num_tokens": 12202078.0, + "reward": 3.9274282455444336, + "reward_std": 0.41052699089050293, + "rewards/reward_fn/mean": 3.9274282455444336, + "rewards/reward_fn/std": 0.41052699089050293, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 192.96875, + "completions/mean_terminated_length": 192.96875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.06005312391731147, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.024617151386337355, + "learning_rate": 6.962e-06, + "loss": 0.001, + "num_tokens": 12227005.0, + "reward": 3.3080894947052, + "reward_std": 0.6809019446372986, + "rewards/reward_fn/mean": 3.3080894947052, + "rewards/reward_fn/std": 0.6809019446372986, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 297.84375, + "completions/mean_terminated_length": 297.84375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.06016861069407553, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.01090382434631465, + "learning_rate": 6.9599999999999994e-06, + "loss": 0.0004, + "num_tokens": 12259672.0, + "reward": 3.6483206748962402, + "reward_std": 0.8304150700569153, + "rewards/reward_fn/mean": 3.6483206748962402, + "rewards/reward_fn/std": 0.8304150700569153, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 176.125, + "completions/mean_terminated_length": 176.125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.06028409747083959, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.012250970845343545, + "learning_rate": 6.958e-06, + "loss": 0.0005, + "num_tokens": 12278172.0, + "reward": 3.2819247245788574, + "reward_std": 0.4060117304325104, + "rewards/reward_fn/mean": 3.2819247245788574, + "rewards/reward_fn/std": 0.40601176023483276, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 186.59375, + "completions/mean_terminated_length": 186.59375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.06039958424760365, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.014600050912122242, + "learning_rate": 6.956e-06, + "loss": 0.0006, + "num_tokens": 12299503.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 83.46875, + "completions/mean_terminated_length": 83.46875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.06051507102436771, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.011119921829958912, + "learning_rate": 6.954e-06, + "loss": 0.0004, + "num_tokens": 12317214.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 336.21875, + "completions/mean_terminated_length": 336.21875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.060630557801131774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.014544936857419088, + "learning_rate": 6.952e-06, + "loss": 0.0006, + "num_tokens": 12339557.0, + "reward": 3.853440761566162, + "reward_std": 0.5767353773117065, + "rewards/reward_fn/mean": 3.853440761566162, + "rewards/reward_fn/std": 0.5767353773117065, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 76.34375, + "completions/mean_terminated_length": 76.34375, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.060746044577895834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1591796875, + "kl": 0.013601378494058736, + "learning_rate": 6.9499999999999995e-06, + "loss": 0.0005, + "num_tokens": 12355696.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 133.375, + "completions/mean_terminated_length": 133.375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.06086153135465989, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.012577030443935655, + "learning_rate": 6.948e-06, + "loss": 0.0005, + "num_tokens": 12371612.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 132.84375, + "completions/mean_terminated_length": 132.84375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.06097701813142395, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.020203502019285224, + "learning_rate": 6.945999999999999e-06, + "loss": 0.0008, + "num_tokens": 12385495.0, + "reward": 3.0589797496795654, + "reward_std": 0.32714223861694336, + "rewards/reward_fn/mean": 3.0589797496795654, + "rewards/reward_fn/std": 0.32714226841926575, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 198.875, + "completions/mean_terminated_length": 198.875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.06109250490818801, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.01830424030777067, + "learning_rate": 6.944e-06, + "loss": 0.0007, + "num_tokens": 12414739.0, + "reward": 3.950789213180542, + "reward_std": 0.19383372366428375, + "rewards/reward_fn/mean": 3.950789213180542, + "rewards/reward_fn/std": 0.19383369386196136, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 217.25, + "completions/mean_terminated_length": 217.25, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.06120799168495207, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05712890625, + "kl": 0.013238994564744644, + "learning_rate": 6.942e-06, + "loss": 0.0005, + "num_tokens": 12435227.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 122.28125, + "completions/mean_terminated_length": 122.28125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.06132347846171613, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.02304468686634209, + "learning_rate": 6.9400000000000005e-06, + "loss": 0.0009, + "num_tokens": 12460260.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 881.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 399.34375, + "completions/mean_terminated_length": 399.34375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.06143896523848019, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.016632261351333, + "learning_rate": 6.9379999999999995e-06, + "loss": 0.0007, + "num_tokens": 12488143.0, + "reward": 3.927182674407959, + "reward_std": 0.41191667318344116, + "rewards/reward_fn/mean": 3.927182674407959, + "rewards/reward_fn/std": 0.41191667318344116, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 202.1875, + "completions/mean_terminated_length": 202.1875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.06155445201524425, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.016976595477899536, + "learning_rate": 6.935999999999999e-06, + "loss": 0.0007, + "num_tokens": 12513621.0, + "reward": 3.0300025939941406, + "reward_std": 0.27452999353408813, + "rewards/reward_fn/mean": 3.0300025939941406, + "rewards/reward_fn/std": 0.2745300233364105, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 105.03125, + "completions/mean_terminated_length": 105.03125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.061669938792008314, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "kl": 0.02865500151528977, + "learning_rate": 6.934e-06, + "loss": 0.0011, + "num_tokens": 12530038.0, + "reward": 3.9688498973846436, + "reward_std": 0.12320633232593536, + "rewards/reward_fn/mean": 3.9688498973846436, + "rewards/reward_fn/std": 0.12320629507303238, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 96.09375, + "completions/mean_terminated_length": 96.09375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.061785425568772374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.01394811131467577, + "learning_rate": 6.932e-06, + "loss": 0.0006, + "num_tokens": 12551321.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 102.21875, + "completions/mean_terminated_length": 102.21875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.061900912345536435, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "kl": 0.021773886284790933, + "learning_rate": 6.93e-06, + "loss": 0.0009, + "num_tokens": 12572384.0, + "reward": 3.922173023223877, + "reward_std": 0.20927712321281433, + "rewards/reward_fn/mean": 3.922173023223877, + "rewards/reward_fn/std": 0.20927709341049194, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 268.5625, + "completions/mean_terminated_length": 268.5625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.062016399122300496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.01246120806899853, + "learning_rate": 6.9279999999999996e-06, + "loss": 0.0005, + "num_tokens": 12607314.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1558.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 364.0, + "completions/mean_terminated_length": 364.0, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.06213188589906456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.021249523881124333, + "learning_rate": 6.926e-06, + "loss": 0.0008, + "num_tokens": 12632210.0, + "reward": 3.450819492340088, + "reward_std": 1.093404769897461, + "rewards/reward_fn/mean": 3.450819492340088, + "rewards/reward_fn/std": 1.093404769897461, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 95.65625, + "completions/mean_terminated_length": 95.65625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.06224737267582862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.018108526259311475, + "learning_rate": 6.924e-06, + "loss": 0.0007, + "num_tokens": 12646567.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 56.3125, + "completions/mean_terminated_length": 56.3125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.06236285945259268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1708984375, + "kl": 0.016451794406748377, + "learning_rate": 6.921999999999999e-06, + "loss": 0.0007, + "num_tokens": 12660017.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 158.34375, + "completions/mean_terminated_length": 158.34375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.06247834622935674, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.02186195956892334, + "learning_rate": 6.92e-06, + "loss": 0.0009, + "num_tokens": 12683356.0, + "reward": 3.9314568042755127, + "reward_std": 0.3877388834953308, + "rewards/reward_fn/mean": 3.9314568042755127, + "rewards/reward_fn/std": 0.3877389132976532, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 54.59375, + "completions/mean_terminated_length": 54.59375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.0625938330061208, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.6875, + "kl": 0.014484454812190961, + "learning_rate": 6.918e-06, + "loss": 0.0006, + "num_tokens": 12701807.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 97.125, + "completions/mean_terminated_length": 97.125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.06270931978288485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.008275303545815405, + "learning_rate": 6.916e-06, + "loss": 0.0003, + "num_tokens": 12727283.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 168.5, + "completions/mean_terminated_length": 168.5, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.06282480655964892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.011480208137072623, + "learning_rate": 6.913999999999999e-06, + "loss": 0.0005, + "num_tokens": 12756771.0, + "reward": 2.873401641845703, + "reward_std": 0.04115013778209686, + "rewards/reward_fn/mean": 2.873401641845703, + "rewards/reward_fn/std": 0.041150107979774475, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 180.25, + "completions/mean_terminated_length": 180.25, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.06294029333641298, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.031000380316982046, + "learning_rate": 6.911999999999999e-06, + "loss": 0.0012, + "num_tokens": 12785483.0, + "reward": 3.0330026149749756, + "reward_std": 0.42482730746269226, + "rewards/reward_fn/mean": 3.0330026149749756, + "rewards/reward_fn/std": 0.42482733726501465, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 108.8125, + "completions/mean_terminated_length": 108.8125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.06305578011317704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.162109375, + "kl": 0.025636094360379502, + "learning_rate": 6.91e-06, + "loss": 0.001, + "num_tokens": 12812549.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 264.75, + "completions/mean_terminated_length": 207.22579956054688, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.0631712668899411, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.61328125, + "kl": 0.020440938737010583, + "learning_rate": 6.908e-06, + "loss": 0.0008, + "num_tokens": 12840477.0, + "reward": 3.8485050201416016, + "reward_std": 0.7180711030960083, + "rewards/reward_fn/mean": 3.8485050201416016, + "rewards/reward_fn/std": 0.7180710434913635, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 61.75, + "completions/mean_terminated_length": 61.75, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.06328675366670516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.008478778298012912, + "learning_rate": 6.906e-06, + "loss": 0.0003, + "num_tokens": 12852245.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 223.4375, + "completions/mean_terminated_length": 223.4375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.06340224044346922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.015855924284551293, + "learning_rate": 6.9039999999999995e-06, + "loss": 0.0006, + "num_tokens": 12872291.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 319.21875, + "completions/mean_terminated_length": 319.21875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.06351772722023329, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.015726034907856956, + "learning_rate": 6.902e-06, + "loss": 0.0006, + "num_tokens": 12894954.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 250.0, + "completions/mean_terminated_length": 250.0, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.06363321399699734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.020727086201077327, + "learning_rate": 6.9e-06, + "loss": 0.0008, + "num_tokens": 12925066.0, + "reward": 3.5520529747009277, + "reward_std": 0.7365944981575012, + "rewards/reward_fn/mean": 3.5520529747009277, + "rewards/reward_fn/std": 0.7365943789482117, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 105.90625, + "completions/mean_terminated_length": 105.90625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.06374870077376141, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1025390625, + "kl": 0.018261660254211165, + "learning_rate": 6.897999999999999e-06, + "loss": 0.0007, + "num_tokens": 12949191.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 104.96875, + "completions/mean_terminated_length": 104.96875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.06386418755052546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.01964278436207678, + "learning_rate": 6.896e-06, + "loss": 0.0008, + "num_tokens": 12964646.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 281.875, + "completions/mean_terminated_length": 281.875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.06397967432728953, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.014959082705900073, + "learning_rate": 6.8939999999999996e-06, + "loss": 0.0006, + "num_tokens": 12986722.0, + "reward": 3.791086435317993, + "reward_std": 0.5242055654525757, + "rewards/reward_fn/mean": 3.791086435317993, + "rewards/reward_fn/std": 0.5242055654525757, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 118.40625, + "completions/mean_terminated_length": 118.40625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.06409516110405358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.015126187558053061, + "learning_rate": 6.892e-06, + "loss": 0.0006, + "num_tokens": 13004239.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 178.28125, + "completions/mean_terminated_length": 178.28125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.06421064788081765, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.34375, + "kl": 0.022996881656581536, + "learning_rate": 6.889999999999999e-06, + "loss": 0.0009, + "num_tokens": 13028280.0, + "reward": 2.9897968769073486, + "reward_std": 0.234219491481781, + "rewards/reward_fn/mean": 2.9897968769073486, + "rewards/reward_fn/std": 0.2342195063829422, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 84.53125, + "completions/mean_terminated_length": 84.53125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.0643261346575817, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.009592237754986854, + "learning_rate": 6.888e-06, + "loss": 0.0004, + "num_tokens": 13044681.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 228.96875, + "completions/mean_terminated_length": 228.96875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.06444162143434577, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.01582691264047753, + "learning_rate": 6.886e-06, + "loss": 0.0006, + "num_tokens": 13063976.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 114.8125, + "completions/mean_terminated_length": 114.8125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.06455710821110983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.012762256752466783, + "learning_rate": 6.884e-06, + "loss": 0.0005, + "num_tokens": 13083650.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 94.46875, + "completions/mean_terminated_length": 94.46875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.0646725949878739, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.01803586244932376, + "learning_rate": 6.8819999999999995e-06, + "loss": 0.0007, + "num_tokens": 13098097.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 135.28125, + "completions/mean_terminated_length": 135.28125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.06478808176463795, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.96875, + "kl": 0.01668475230690092, + "learning_rate": 6.879999999999999e-06, + "loss": 0.0007, + "num_tokens": 13126362.0, + "reward": 3.8544533252716064, + "reward_std": 0.5727260708808899, + "rewards/reward_fn/mean": 3.8544533252716064, + "rewards/reward_fn/std": 0.5727260112762451, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 360.4375, + "completions/mean_terminated_length": 306.0, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.06490356854140202, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.022362300398526713, + "learning_rate": 6.878e-06, + "loss": 0.0009, + "num_tokens": 13156200.0, + "reward": 3.373575210571289, + "reward_std": 0.8674547076225281, + "rewards/reward_fn/mean": 3.373575210571289, + "rewards/reward_fn/std": 0.8674547076225281, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.06501905531816607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.013571206480264664, + "learning_rate": 6.876e-06, + "loss": 0.0005, + "num_tokens": 13180158.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 98.5625, + "completions/mean_terminated_length": 98.5625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.06513454209493014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.014166056716931053, + "learning_rate": 6.874e-06, + "loss": 0.0006, + "num_tokens": 13195376.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 115.59375, + "completions/mean_terminated_length": 115.59375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.06525002887169419, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.014086434894124977, + "learning_rate": 6.872e-06, + "loss": 0.0006, + "num_tokens": 13217411.0, + "reward": 3.4634108543395996, + "reward_std": 0.10574649274349213, + "rewards/reward_fn/mean": 3.4634108543395996, + "rewards/reward_fn/std": 0.10574649274349213, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 277.65625, + "completions/mean_terminated_length": 277.65625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.06536551564845826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.011513520177686587, + "learning_rate": 6.8699999999999994e-06, + "loss": 0.0005, + "num_tokens": 13251096.0, + "reward": 3.2190260887145996, + "reward_std": 0.6174677014350891, + "rewards/reward_fn/mean": 3.2190260887145996, + "rewards/reward_fn/std": 0.6174677014350891, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 174.6875, + "completions/mean_terminated_length": 174.6875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.06548100242522231, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.010542570213146973, + "learning_rate": 6.868e-06, + "loss": 0.0004, + "num_tokens": 13278702.0, + "reward": 3.9683518409729004, + "reward_std": 0.17902909219264984, + "rewards/reward_fn/mean": 3.9683518409729004, + "rewards/reward_fn/std": 0.17902910709381104, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 102.0, + "completions/mean_terminated_length": 102.0, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.06559648920198637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.013615573887364008, + "learning_rate": 6.865999999999999e-06, + "loss": 0.0005, + "num_tokens": 13300430.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 101.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 84.40625, + "completions/mean_terminated_length": 84.40625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.06571197597875043, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.009162359427136835, + "learning_rate": 6.864e-06, + "loss": 0.0004, + "num_tokens": 13328763.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 126.15625, + "completions/mean_terminated_length": 126.15625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.06582746275551449, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.012217605268233456, + "learning_rate": 6.862e-06, + "loss": 0.0005, + "num_tokens": 13351840.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 93.625, + "completions/mean_terminated_length": 93.625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.06594294953227856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.014706676091009285, + "learning_rate": 6.86e-06, + "loss": 0.0006, + "num_tokens": 13372980.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 123.8125, + "completions/mean_terminated_length": 123.8125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.06605843630904261, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.04481723572826013, + "learning_rate": 6.857999999999999e-06, + "loss": 0.0018, + "num_tokens": 13395246.0, + "reward": 3.97526478767395, + "reward_std": 0.1399237960577011, + "rewards/reward_fn/mean": 3.97526478767395, + "rewards/reward_fn/std": 0.13992378115653992, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 158.375, + "completions/mean_terminated_length": 158.375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.06617392308580668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.012781327284756117, + "learning_rate": 6.855999999999999e-06, + "loss": 0.0005, + "num_tokens": 13415610.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 109.1875, + "completions/mean_terminated_length": 109.1875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.06628940986257073, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.017363852879498154, + "learning_rate": 6.854e-06, + "loss": 0.0007, + "num_tokens": 13436832.0, + "reward": 3.928508996963501, + "reward_std": 0.404414564371109, + "rewards/reward_fn/mean": 3.928508996963501, + "rewards/reward_fn/std": 0.4044146239757538, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 220.0, + "completions/mean_terminated_length": 220.0, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.0664048966393348, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.011121291216113605, + "learning_rate": 6.852e-06, + "loss": 0.0004, + "num_tokens": 13464448.0, + "reward": 3.9279894828796387, + "reward_std": 0.40735292434692383, + "rewards/reward_fn/mean": 3.9279894828796387, + "rewards/reward_fn/std": 0.40735292434692383, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 165.875, + "completions/mean_terminated_length": 165.875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.06652038341609885, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.01373782034352189, + "learning_rate": 6.85e-06, + "loss": 0.0005, + "num_tokens": 13482588.0, + "reward": 3.9295833110809326, + "reward_std": 0.39833691716194153, + "rewards/reward_fn/mean": 3.9295833110809326, + "rewards/reward_fn/std": 0.39833691716194153, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 173.59375, + "completions/mean_terminated_length": 173.59375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.06663587019286292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.014588659032597207, + "learning_rate": 6.8479999999999995e-06, + "loss": 0.0006, + "num_tokens": 13506319.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 170.0, + "completions/mean_terminated_length": 170.0, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.06675135696962697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.01492637165938504, + "learning_rate": 6.846e-06, + "loss": 0.0006, + "num_tokens": 13528463.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 95.1875, + "completions/mean_terminated_length": 95.1875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.06686684374639104, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "kl": 0.011823240827652626, + "learning_rate": 6.844e-06, + "loss": 0.0005, + "num_tokens": 13554229.0, + "reward": 3.9851861000061035, + "reward_std": 0.08380056917667389, + "rewards/reward_fn/mean": 3.9851861000061035, + "rewards/reward_fn/std": 0.0838005393743515, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 142.25, + "completions/mean_terminated_length": 142.25, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.0669823305231551, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.022643631586106494, + "learning_rate": 6.841999999999999e-06, + "loss": 0.0009, + "num_tokens": 13581533.0, + "reward": 3.9329159259796143, + "reward_std": 0.37948447465896606, + "rewards/reward_fn/mean": 3.9329159259796143, + "rewards/reward_fn/std": 0.3794844448566437, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 301.40625, + "completions/mean_terminated_length": 301.40625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.06709781729991916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.01496353987022303, + "learning_rate": 6.84e-06, + "loss": 0.0006, + "num_tokens": 13605770.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 86.875, + "completions/mean_terminated_length": 86.875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.06721330407668322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.00567904140916653, + "learning_rate": 6.838e-06, + "loss": 0.0002, + "num_tokens": 13622790.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 173.9375, + "completions/mean_terminated_length": 173.9375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.06732879085344728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.010035865561803803, + "learning_rate": 6.836e-06, + "loss": 0.0004, + "num_tokens": 13653604.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 218.375, + "completions/mean_terminated_length": 218.375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.06744427763021134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.01446740358369425, + "learning_rate": 6.833999999999999e-06, + "loss": 0.0006, + "num_tokens": 13684080.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 62.0, + "completions/mean_terminated_length": 62.0, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.0675597644069754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.008722342230612412, + "learning_rate": 6.831999999999999e-06, + "loss": 0.0003, + "num_tokens": 13697872.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 267.875, + "completions/mean_terminated_length": 267.875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.06767525118373946, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.023831430007703602, + "learning_rate": 6.83e-06, + "loss": 0.001, + "num_tokens": 13723756.0, + "reward": 3.5335240364074707, + "reward_std": 0.5119786858558655, + "rewards/reward_fn/mean": 3.5335240364074707, + "rewards/reward_fn/std": 0.5119786858558655, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 60.46875, + "completions/mean_terminated_length": 60.46875, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.06779073796050353, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.90625, + "kl": 0.03366078995168209, + "learning_rate": 6.828e-06, + "loss": 0.0013, + "num_tokens": 13751131.0, + "reward": 3.75, + "reward_std": 0.9837387204170227, + "rewards/reward_fn/mean": 3.75, + "rewards/reward_fn/std": 0.9837387204170227, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 226.90625, + "completions/mean_terminated_length": 226.90625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.06790622473726758, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.01140687607403379, + "learning_rate": 6.8259999999999995e-06, + "loss": 0.0005, + "num_tokens": 13768024.0, + "reward": 3.932441234588623, + "reward_std": 0.3821699321269989, + "rewards/reward_fn/mean": 3.932441234588623, + "rewards/reward_fn/std": 0.3821699619293213, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 193.3125, + "completions/mean_terminated_length": 193.3125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.06802171151403165, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.021920470710028894, + "learning_rate": 6.823999999999999e-06, + "loss": 0.0009, + "num_tokens": 13795458.0, + "reward": 3.6304123401641846, + "reward_std": 0.5204261541366577, + "rewards/reward_fn/mean": 3.6304123401641846, + "rewards/reward_fn/std": 0.5204262137413025, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 204.46875, + "completions/mean_terminated_length": 204.46875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.0681371982907957, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.018768813271890394, + "learning_rate": 6.822e-06, + "loss": 0.0008, + "num_tokens": 13821105.0, + "reward": 3.7798290252685547, + "reward_std": 0.5004222393035889, + "rewards/reward_fn/mean": 3.7798290252685547, + "rewards/reward_fn/std": 0.5004222393035889, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 253.25, + "completions/mean_terminated_length": 253.25, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.06825268506755977, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.01715254387818277, + "learning_rate": 6.82e-06, + "loss": 0.0007, + "num_tokens": 13847481.0, + "reward": 2.763941526412964, + "reward_std": 0.6772220134735107, + "rewards/reward_fn/mean": 2.763941526412964, + "rewards/reward_fn/std": 0.6772220134735107, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 196.75, + "completions/mean_terminated_length": 196.75, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.06836817184432382, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.023650736577110365, + "learning_rate": 6.817999999999999e-06, + "loss": 0.0009, + "num_tokens": 13876913.0, + "reward": 3.980940818786621, + "reward_std": 0.10781513899564743, + "rewards/reward_fn/mean": 3.980940818786621, + "rewards/reward_fn/std": 0.10781513899564743, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 297.6875, + "completions/mean_terminated_length": 297.6875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.06848365862108789, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.01821250023203902, + "learning_rate": 6.816e-06, + "loss": 0.0007, + "num_tokens": 13904743.0, + "reward": 3.4050683975219727, + "reward_std": 0.7249109745025635, + "rewards/reward_fn/mean": 3.4050683975219727, + "rewards/reward_fn/std": 0.7249109148979187, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 88.25, + "completions/mean_terminated_length": 88.25, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.06859914539785195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2158203125, + "kl": 0.022071643084927928, + "learning_rate": 6.8139999999999995e-06, + "loss": 0.0009, + "num_tokens": 13926863.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 193.40625, + "completions/mean_terminated_length": 193.40625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.068714632174616, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.01405572413932532, + "learning_rate": 6.812e-06, + "loss": 0.0006, + "num_tokens": 13958428.0, + "reward": 3.928354501724243, + "reward_std": 0.40528780221939087, + "rewards/reward_fn/mean": 3.928354501724243, + "rewards/reward_fn/std": 0.40528786182403564, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 177.71875, + "completions/mean_terminated_length": 177.71875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.06883011895138007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.01644651120295748, + "learning_rate": 6.809999999999999e-06, + "loss": 0.0007, + "num_tokens": 13975507.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 883.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 345.125, + "completions/mean_terminated_length": 345.125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.06894560572814412, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.01712788548320532, + "learning_rate": 6.808e-06, + "loss": 0.0007, + "num_tokens": 13998487.0, + "reward": 3.9304304122924805, + "reward_std": 0.39354580640792847, + "rewards/reward_fn/mean": 3.9304304122924805, + "rewards/reward_fn/std": 0.39354580640792847, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 184.25, + "completions/mean_terminated_length": 184.25, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.06906109250490819, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.01356002941611223, + "learning_rate": 6.806e-06, + "loss": 0.0005, + "num_tokens": 14012255.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 211.96875, + "completions/mean_terminated_length": 211.96875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.06917657928167224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.01709217372990679, + "learning_rate": 6.8039999999999996e-06, + "loss": 0.0007, + "num_tokens": 14031006.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 98.0, + "completions/mean_terminated_length": 98.0, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.06929206605843631, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.016174260643310845, + "learning_rate": 6.801999999999999e-06, + "loss": 0.0006, + "num_tokens": 14054270.0, + "reward": 3.9712188243865967, + "reward_std": 0.16281075775623322, + "rewards/reward_fn/mean": 3.9712188243865967, + "rewards/reward_fn/std": 0.16281074285507202, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 212.5, + "completions/mean_terminated_length": 212.5, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.06940755283520036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.01597186412254814, + "learning_rate": 6.799999999999999e-06, + "loss": 0.0006, + "num_tokens": 14073774.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1066.0, + "completions/mean_length": 490.15625, + "completions/mean_terminated_length": 439.9031982421875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.06952303961196443, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.01460757754102815, + "learning_rate": 6.798e-06, + "loss": 0.0006, + "num_tokens": 14102323.0, + "reward": 3.7259864807128906, + "reward_std": 0.8974677324295044, + "rewards/reward_fn/mean": 3.7259864807128906, + "rewards/reward_fn/std": 0.8974677920341492, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 189.34375, + "completions/mean_terminated_length": 189.34375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.06963852638872849, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.01807582091714721, + "learning_rate": 6.796e-06, + "loss": 0.0007, + "num_tokens": 14132382.0, + "reward": 3.106616497039795, + "reward_std": 0.0966046005487442, + "rewards/reward_fn/mean": 3.106616497039795, + "rewards/reward_fn/std": 0.09660462290048599, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 171.3125, + "completions/mean_terminated_length": 171.3125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.06975401316549255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.006591935358301271, + "learning_rate": 6.794e-06, + "loss": 0.0003, + "num_tokens": 14161576.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 106.1875, + "completions/mean_terminated_length": 106.1875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.06986949994225661, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.009572153230692493, + "learning_rate": 6.7919999999999995e-06, + "loss": 0.0004, + "num_tokens": 14189742.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 210.875, + "completions/mean_terminated_length": 210.875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.06998498671902068, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.014765210202313028, + "learning_rate": 6.789999999999999e-06, + "loss": 0.0006, + "num_tokens": 14208810.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 116.125, + "completions/mean_terminated_length": 116.125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.07010047349578473, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.01517269414034672, + "learning_rate": 6.788e-06, + "loss": 0.0006, + "num_tokens": 14228622.0, + "reward": 3.4940528869628906, + "reward_std": 0.12900099158287048, + "rewards/reward_fn/mean": 3.4940528869628906, + "rewards/reward_fn/std": 0.12900099158287048, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 127.8125, + "completions/mean_terminated_length": 127.8125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.0702159602725488, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.021258387103443965, + "learning_rate": 6.785999999999999e-06, + "loss": 0.0008, + "num_tokens": 14250792.0, + "reward": 3.9697513580322266, + "reward_std": 0.17111161351203918, + "rewards/reward_fn/mean": 3.9697513580322266, + "rewards/reward_fn/std": 0.17111161351203918, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 279.28125, + "completions/mean_terminated_length": 279.28125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.07033144704931285, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.016458050813525915, + "learning_rate": 6.784e-06, + "loss": 0.0007, + "num_tokens": 14271633.0, + "reward": 3.307446002960205, + "reward_std": 1.0437908172607422, + "rewards/reward_fn/mean": 3.307446002960205, + "rewards/reward_fn/std": 1.0437906980514526, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 74.71875, + "completions/mean_terminated_length": 74.71875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.07044693382607692, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1181640625, + "kl": 0.012487782292737393, + "learning_rate": 6.782e-06, + "loss": 0.0005, + "num_tokens": 14295016.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 275.65625, + "completions/mean_terminated_length": 275.65625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.07056242060284097, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.024515047145541757, + "learning_rate": 6.78e-06, + "loss": 0.001, + "num_tokens": 14322845.0, + "reward": 2.7363147735595703, + "reward_std": 0.4068790376186371, + "rewards/reward_fn/mean": 2.7363147735595703, + "rewards/reward_fn/std": 0.4068790078163147, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 197.90625, + "completions/mean_terminated_length": 197.90625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.07067790737960504, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.017937280703336, + "learning_rate": 6.777999999999999e-06, + "loss": 0.0007, + "num_tokens": 14347258.0, + "reward": 3.8496782779693604, + "reward_std": 0.29902949929237366, + "rewards/reward_fn/mean": 3.8496782779693604, + "rewards/reward_fn/std": 0.29902949929237366, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 229.40625, + "completions/mean_terminated_length": 229.40625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.0707933941563691, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.014664209113107063, + "learning_rate": 6.775999999999999e-06, + "loss": 0.0006, + "num_tokens": 14376423.0, + "reward": 3.8350167274475098, + "reward_std": 0.5551104545593262, + "rewards/reward_fn/mean": 3.8350167274475098, + "rewards/reward_fn/std": 0.5551104545593262, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 166.5625, + "completions/mean_terminated_length": 166.5625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.07090888093313316, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.019016951642697677, + "learning_rate": 6.774e-06, + "loss": 0.0008, + "num_tokens": 14398841.0, + "reward": 3.8222827911376953, + "reward_std": 0.3760036826133728, + "rewards/reward_fn/mean": 3.8222827911376953, + "rewards/reward_fn/std": 0.3760036528110504, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 94.03125, + "completions/mean_terminated_length": 94.03125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.07102436770989722, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.875, + "kl": 0.02925722030340694, + "learning_rate": 6.772e-06, + "loss": 0.0012, + "num_tokens": 14423258.0, + "reward": 3.410187005996704, + "reward_std": 0.0453934520483017, + "rewards/reward_fn/mean": 3.410187005996704, + "rewards/reward_fn/std": 0.04539349302649498, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 301.6875, + "completions/mean_terminated_length": 301.6875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.07113985448666128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.009802420987398364, + "learning_rate": 6.7699999999999996e-06, + "loss": 0.0004, + "num_tokens": 14454480.0, + "reward": 3.5938122272491455, + "reward_std": 0.6413049697875977, + "rewards/reward_fn/mean": 3.5938122272491455, + "rewards/reward_fn/std": 0.6413049697875977, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 279.46875, + "completions/mean_terminated_length": 279.46875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.07125534126342534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.014040116991964169, + "learning_rate": 6.767999999999999e-06, + "loss": 0.0006, + "num_tokens": 14474591.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 119.25, + "completions/mean_terminated_length": 119.25, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.0713708280401894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.022073402782552876, + "learning_rate": 6.766e-06, + "loss": 0.0009, + "num_tokens": 14497415.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 112.0625, + "completions/mean_terminated_length": 112.0625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.07148631481695346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.01201180815405678, + "learning_rate": 6.764e-06, + "loss": 0.0005, + "num_tokens": 14515177.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 153.8125, + "completions/mean_terminated_length": 153.8125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.07160180159371753, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.01367285534797702, + "learning_rate": 6.761999999999999e-06, + "loss": 0.0005, + "num_tokens": 14536291.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 480.71875, + "completions/mean_terminated_length": 480.71875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.07171728837048158, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.008324610447743908, + "learning_rate": 6.76e-06, + "loss": 0.0003, + "num_tokens": 14576922.0, + "reward": 3.6267247200012207, + "reward_std": 0.7626547813415527, + "rewards/reward_fn/mean": 3.6267247200012207, + "rewards/reward_fn/std": 0.7626547813415527, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 185.96875, + "completions/mean_terminated_length": 185.96875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.07183277514724563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.017058526980690658, + "learning_rate": 6.7579999999999995e-06, + "loss": 0.0007, + "num_tokens": 14604153.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 76.40625, + "completions/mean_terminated_length": 76.40625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.0719482619240097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.008867462955095107, + "learning_rate": 6.756e-06, + "loss": 0.0004, + "num_tokens": 14616358.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 251.96875, + "completions/mean_terminated_length": 251.96875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.07206374870077376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.019144439618685283, + "learning_rate": 6.753999999999999e-06, + "loss": 0.0008, + "num_tokens": 14646437.0, + "reward": 3.020759105682373, + "reward_std": 0.24194949865341187, + "rewards/reward_fn/mean": 3.020759105682373, + "rewards/reward_fn/std": 0.24194952845573425, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 142.3125, + "completions/mean_terminated_length": 142.3125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.07217923547753782, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.011660271877190098, + "learning_rate": 6.751999999999999e-06, + "loss": 0.0005, + "num_tokens": 14668719.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 176.875, + "completions/mean_terminated_length": 176.875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.07229472225430188, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.011758154039853252, + "learning_rate": 6.75e-06, + "loss": 0.0005, + "num_tokens": 14688811.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 190.90625, + "completions/mean_terminated_length": 190.90625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.07241020903106594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.138671875, + "kl": 0.027842102572321892, + "learning_rate": 6.748e-06, + "loss": 0.0011, + "num_tokens": 14706984.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 87.65625, + "completions/mean_terminated_length": 87.65625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.07252569580783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.010969811519316863, + "learning_rate": 6.746e-06, + "loss": 0.0004, + "num_tokens": 14729085.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 106.28125, + "completions/mean_terminated_length": 106.28125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.07264118258459407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.01314940488373395, + "learning_rate": 6.743999999999999e-06, + "loss": 0.0005, + "num_tokens": 14744198.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 70.53125, + "completions/mean_terminated_length": 70.53125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.07275666936135812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.009465581686527003, + "learning_rate": 6.742e-06, + "loss": 0.0004, + "num_tokens": 14758519.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 127.625, + "completions/mean_terminated_length": 127.625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.07287215613812219, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "kl": 0.01231029682821827, + "learning_rate": 6.74e-06, + "loss": 0.0005, + "num_tokens": 14785259.0, + "reward": 3.760606050491333, + "reward_std": 0.6703487634658813, + "rewards/reward_fn/mean": 3.760606050491333, + "rewards/reward_fn/std": 0.6703488230705261, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 298.25, + "completions/mean_terminated_length": 298.25, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.07298764291488624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.013198252665461041, + "learning_rate": 6.7380000000000005e-06, + "loss": 0.0005, + "num_tokens": 14818387.0, + "reward": 2.668314218521118, + "reward_std": 0.18707385659217834, + "rewards/reward_fn/mean": 2.668314218521118, + "rewards/reward_fn/std": 0.18707388639450073, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 82.125, + "completions/mean_terminated_length": 82.125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.07310312969165031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.259765625, + "kl": 0.03625876706792042, + "learning_rate": 6.7359999999999995e-06, + "loss": 0.0015, + "num_tokens": 14836119.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 142.46875, + "completions/mean_terminated_length": 142.46875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.07321861646841436, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.01115440829016734, + "learning_rate": 6.733999999999999e-06, + "loss": 0.0004, + "num_tokens": 14857350.0, + "reward": 3.61527681350708, + "reward_std": 0.5820685029029846, + "rewards/reward_fn/mean": 3.61527681350708, + "rewards/reward_fn/std": 0.5820685029029846, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1402.0, + "completions/max_terminated_length": 1402.0, + "completions/mean_length": 438.15625, + "completions/mean_terminated_length": 438.15625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.07333410324517843, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.01591721171280369, + "learning_rate": 6.732e-06, + "loss": 0.0006, + "num_tokens": 14883755.0, + "reward": 3.929356575012207, + "reward_std": 0.39962005615234375, + "rewards/reward_fn/mean": 3.929356575012207, + "rewards/reward_fn/std": 0.39962008595466614, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 386.625, + "completions/mean_terminated_length": 333.0322570800781, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.07344959002194248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.388671875, + "kl": 0.011898589495103806, + "learning_rate": 6.73e-06, + "loss": 0.0005, + "num_tokens": 14905599.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 64.59375, + "completions/mean_terminated_length": 64.59375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.07356507679870655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19140625, + "kl": 0.02224521127936896, + "learning_rate": 6.728e-06, + "loss": 0.0009, + "num_tokens": 14917298.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 172.78125, + "completions/mean_terminated_length": 172.78125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.0736805635754706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.015326504653785378, + "learning_rate": 6.726e-06, + "loss": 0.0006, + "num_tokens": 14943147.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 124.59375, + "completions/mean_terminated_length": 124.59375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.07379605035223467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.012827393526094966, + "learning_rate": 6.7239999999999995e-06, + "loss": 0.0005, + "num_tokens": 14971998.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 145.125, + "completions/mean_terminated_length": 145.125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.07391153712899873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.01568839767423924, + "learning_rate": 6.722e-06, + "loss": 0.0006, + "num_tokens": 14988194.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 208.625, + "completions/mean_terminated_length": 208.625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.0740270239057628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.012641682202229276, + "learning_rate": 6.719999999999999e-06, + "loss": 0.0005, + "num_tokens": 15012118.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1119.0, + "completions/max_terminated_length": 1119.0, + "completions/mean_length": 347.6875, + "completions/mean_terminated_length": 347.6875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.07414251068252685, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.016168620641110465, + "learning_rate": 6.718e-06, + "loss": 0.0006, + "num_tokens": 15034892.0, + "reward": 3.4069113731384277, + "reward_std": 0.7451757192611694, + "rewards/reward_fn/mean": 3.4069113731384277, + "rewards/reward_fn/std": 0.7451757192611694, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 110.625, + "completions/mean_terminated_length": 110.625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.07425799745929092, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.01853263529483229, + "learning_rate": 6.716e-06, + "loss": 0.0007, + "num_tokens": 15052192.0, + "reward": 3.9784069061279297, + "reward_std": 0.12214864045381546, + "rewards/reward_fn/mean": 3.9784069061279297, + "rewards/reward_fn/std": 0.12214859575033188, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 257.3125, + "completions/mean_terminated_length": 257.3125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.07437348423605497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.014753123381524347, + "learning_rate": 6.7140000000000004e-06, + "loss": 0.0006, + "num_tokens": 15068554.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 200.84375, + "completions/mean_terminated_length": 200.84375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.07448897101281904, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.011029428205802105, + "learning_rate": 6.7119999999999994e-06, + "loss": 0.0004, + "num_tokens": 15089957.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 212.28125, + "completions/mean_terminated_length": 212.28125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.07460445778958309, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.014482890648650937, + "learning_rate": 6.709999999999999e-06, + "loss": 0.0006, + "num_tokens": 15123086.0, + "reward": 3.6936662197113037, + "reward_std": 0.760776162147522, + "rewards/reward_fn/mean": 3.6936662197113037, + "rewards/reward_fn/std": 0.760776162147522, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 186.9375, + "completions/mean_terminated_length": 186.9375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.07471994456634716, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.012678372993832454, + "learning_rate": 6.708e-06, + "loss": 0.0005, + "num_tokens": 15141964.0, + "reward": 3.43575382232666, + "reward_std": 0.9930813908576965, + "rewards/reward_fn/mean": 3.43575382232666, + "rewards/reward_fn/std": 0.9930813908576965, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 159.40625, + "completions/mean_terminated_length": 159.40625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.07483543134311121, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.015388852130854502, + "learning_rate": 6.706e-06, + "loss": 0.0006, + "num_tokens": 15168345.0, + "reward": 3.939119577407837, + "reward_std": 0.23967918753623962, + "rewards/reward_fn/mean": 3.939119577407837, + "rewards/reward_fn/std": 0.23967920243740082, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 880.0, + "completions/max_terminated_length": 880.0, + "completions/mean_length": 488.0, + "completions/mean_terminated_length": 488.0, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.07495091811987527, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.008994526462629437, + "learning_rate": 6.704e-06, + "loss": 0.0004, + "num_tokens": 15208601.0, + "reward": 2.9307315349578857, + "reward_std": 0.8153274655342102, + "rewards/reward_fn/mean": 2.9307315349578857, + "rewards/reward_fn/std": 0.815327525138855, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 202.96875, + "completions/mean_terminated_length": 202.96875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.07506640489663934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.01118495487025939, + "learning_rate": 6.7019999999999995e-06, + "loss": 0.0004, + "num_tokens": 15230136.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 243.90625, + "completions/mean_terminated_length": 243.90625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.07518189167340339, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.011730508005712181, + "learning_rate": 6.7e-06, + "loss": 0.0005, + "num_tokens": 15258037.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 179.9375, + "completions/mean_terminated_length": 179.9375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.07529737845016746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.154296875, + "kl": 0.022565666091395542, + "learning_rate": 6.698e-06, + "loss": 0.0009, + "num_tokens": 15281555.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 225.1875, + "completions/mean_terminated_length": 225.1875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.07541286522693151, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.01915533942519687, + "learning_rate": 6.695999999999999e-06, + "loss": 0.0008, + "num_tokens": 15307833.0, + "reward": 3.5694050788879395, + "reward_std": 0.4984169006347656, + "rewards/reward_fn/mean": 3.5694050788879395, + "rewards/reward_fn/std": 0.4984169006347656, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 195.75, + "completions/mean_terminated_length": 195.75, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.07552835200369558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.01171800188603811, + "learning_rate": 6.694e-06, + "loss": 0.0005, + "num_tokens": 15326001.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 221.65625, + "completions/mean_terminated_length": 221.65625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.07564383878045963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.012750241468893364, + "learning_rate": 6.692e-06, + "loss": 0.0005, + "num_tokens": 15343814.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 121.15625, + "completions/mean_terminated_length": 121.15625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.0757593255572237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.004534903815510916, + "learning_rate": 6.69e-06, + "loss": 0.0002, + "num_tokens": 15371243.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 233.6875, + "completions/mean_terminated_length": 233.6875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.07587481233398775, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.011590625006647315, + "learning_rate": 6.687999999999999e-06, + "loss": 0.0005, + "num_tokens": 15404385.0, + "reward": 3.8922691345214844, + "reward_std": 0.41672661900520325, + "rewards/reward_fn/mean": 3.8922691345214844, + "rewards/reward_fn/std": 0.41672658920288086, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 261.90625, + "completions/mean_terminated_length": 261.90625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.07599029911075182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.012310201796935871, + "learning_rate": 6.686e-06, + "loss": 0.0005, + "num_tokens": 15422494.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 282.84375, + "completions/mean_terminated_length": 282.84375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.07610578588751588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046142578125, + "kl": 0.01130834397918079, + "learning_rate": 6.684e-06, + "loss": 0.0005, + "num_tokens": 15442265.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 224.375, + "completions/mean_terminated_length": 224.375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.07622127266427994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.011717482942913193, + "learning_rate": 6.682e-06, + "loss": 0.0005, + "num_tokens": 15464677.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 81.375, + "completions/mean_terminated_length": 81.375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.076336759441044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.007211482366983546, + "learning_rate": 6.6799999999999996e-06, + "loss": 0.0003, + "num_tokens": 15484369.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 64.34375, + "completions/mean_terminated_length": 64.34375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.07645224621780806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.396484375, + "kl": 0.025679815524199512, + "learning_rate": 6.677999999999999e-06, + "loss": 0.001, + "num_tokens": 15498556.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 106.9375, + "completions/mean_terminated_length": 106.9375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.07656773299457212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2138671875, + "kl": 0.011755686533433618, + "learning_rate": 6.676e-06, + "loss": 0.0005, + "num_tokens": 15526842.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 194.34375, + "completions/mean_terminated_length": 194.34375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.07668321977133619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.01808906353835482, + "learning_rate": 6.674e-06, + "loss": 0.0007, + "num_tokens": 15550053.0, + "reward": 2.9472129344940186, + "reward_std": 0.04423438757658005, + "rewards/reward_fn/mean": 2.9472129344940186, + "rewards/reward_fn/std": 0.044234417378902435, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 211.78125, + "completions/mean_terminated_length": 211.78125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.07679870654810024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.018325803728657775, + "learning_rate": 6.672e-06, + "loss": 0.0007, + "num_tokens": 15569086.0, + "reward": 3.9783871173858643, + "reward_std": 0.12226062268018723, + "rewards/reward_fn/mean": 3.9783871173858643, + "rewards/reward_fn/std": 0.12226063013076782, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 129.4375, + "completions/mean_terminated_length": 129.4375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.07691419332486431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.01644519744149875, + "learning_rate": 6.67e-06, + "loss": 0.0007, + "num_tokens": 15583404.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 174.4375, + "completions/mean_terminated_length": 174.4375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.07702968010162836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.01127798248489853, + "learning_rate": 6.6679999999999995e-06, + "loss": 0.0005, + "num_tokens": 15608186.0, + "reward": 3.1793761253356934, + "reward_std": 0.6201541423797607, + "rewards/reward_fn/mean": 3.1793761253356934, + "rewards/reward_fn/std": 0.6201541423797607, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 154.15625, + "completions/mean_terminated_length": 154.15625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.07714516687839243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21484375, + "kl": 0.013656770381203387, + "learning_rate": 6.666e-06, + "loss": 0.0005, + "num_tokens": 15630943.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 328.1875, + "completions/mean_terminated_length": 328.1875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.07726065365515648, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.018012127664405853, + "learning_rate": 6.663999999999999e-06, + "loss": 0.0007, + "num_tokens": 15657349.0, + "reward": 3.929792881011963, + "reward_std": 0.39715108275413513, + "rewards/reward_fn/mean": 3.929792881011963, + "rewards/reward_fn/std": 0.39715105295181274, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 115.34375, + "completions/mean_terminated_length": 115.34375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.07737614043192055, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.010594958897854667, + "learning_rate": 6.662e-06, + "loss": 0.0004, + "num_tokens": 15681872.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 86.96875, + "completions/mean_terminated_length": 86.96875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.0774916272086846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1611328125, + "kl": 0.02039567823521793, + "learning_rate": 6.66e-06, + "loss": 0.0008, + "num_tokens": 15698127.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 371.0, + "completions/mean_terminated_length": 371.0, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.07760711398544867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.01698783195752185, + "learning_rate": 6.6580000000000005e-06, + "loss": 0.0007, + "num_tokens": 15725647.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 150.0625, + "completions/mean_terminated_length": 150.0625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.07772260076221273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.008323958623805083, + "learning_rate": 6.6559999999999995e-06, + "loss": 0.0003, + "num_tokens": 15745649.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 101.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 73.5, + "completions/mean_terminated_length": 73.5, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.0778380875389768, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.0062745205614191946, + "learning_rate": 6.653999999999999e-06, + "loss": 0.0003, + "num_tokens": 15767073.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 170.125, + "completions/mean_terminated_length": 170.125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.07795357431574085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.011709325422998518, + "learning_rate": 6.652e-06, + "loss": 0.0005, + "num_tokens": 15794213.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 213.25, + "completions/mean_terminated_length": 213.25, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.0780690610925049, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.015246755356201902, + "learning_rate": 6.65e-06, + "loss": 0.0006, + "num_tokens": 15823277.0, + "reward": 3.975043773651123, + "reward_std": 0.141173854470253, + "rewards/reward_fn/mean": 3.975043773651123, + "rewards/reward_fn/std": 0.141173854470253, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 210.625, + "completions/mean_terminated_length": 210.625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.07818454786926897, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.013915398856624961, + "learning_rate": 6.648e-06, + "loss": 0.0006, + "num_tokens": 15842785.0, + "reward": 3.977362632751465, + "reward_std": 0.12805631756782532, + "rewards/reward_fn/mean": 3.977362632751465, + "rewards/reward_fn/std": 0.1280563324689865, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 137.3125, + "completions/mean_terminated_length": 137.3125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.07830003464603302, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.010810450228746049, + "learning_rate": 6.6459999999999996e-06, + "loss": 0.0004, + "num_tokens": 15859307.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 105.4375, + "completions/mean_terminated_length": 105.4375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.07841552142279709, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.01276430558937136, + "learning_rate": 6.643999999999999e-06, + "loss": 0.0005, + "num_tokens": 15882521.0, + "reward": 3.3605573177337646, + "reward_std": 0.02658889815211296, + "rewards/reward_fn/mean": 3.3605573177337646, + "rewards/reward_fn/std": 0.02658887952566147, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 80.1875, + "completions/mean_terminated_length": 80.1875, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.07853100819956114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.008676814715727232, + "learning_rate": 6.642e-06, + "loss": 0.0003, + "num_tokens": 15900927.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 141.5625, + "completions/mean_terminated_length": 141.5625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.07864649497632521, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.024190454074414447, + "learning_rate": 6.639999999999999e-06, + "loss": 0.001, + "num_tokens": 15918545.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 144.25, + "completions/mean_terminated_length": 144.25, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.07876198175308927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.011123034622869454, + "learning_rate": 6.638e-06, + "loss": 0.0004, + "num_tokens": 15932921.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 160.09375, + "completions/mean_terminated_length": 160.09375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.07887746852985333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.011630476707068738, + "learning_rate": 6.636e-06, + "loss": 0.0005, + "num_tokens": 15951484.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 81.875, + "completions/mean_terminated_length": 81.875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.07899295530661739, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.007028319079836365, + "learning_rate": 6.634e-06, + "loss": 0.0003, + "num_tokens": 15969848.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 155.625, + "completions/mean_terminated_length": 155.625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.07910844208338146, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.022434977494413033, + "learning_rate": 6.631999999999999e-06, + "loss": 0.0009, + "num_tokens": 15987660.0, + "reward": 3.1176369190216064, + "reward_std": 0.06829281896352768, + "rewards/reward_fn/mean": 3.1176369190216064, + "rewards/reward_fn/std": 0.06829281151294708, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 189.03125, + "completions/mean_terminated_length": 189.03125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.07922392886014551, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.014796425413805991, + "learning_rate": 6.629999999999999e-06, + "loss": 0.0006, + "num_tokens": 16006253.0, + "reward": 3.2597908973693848, + "reward_std": 0.45797237753868103, + "rewards/reward_fn/mean": 3.2597908973693848, + "rewards/reward_fn/std": 0.45797237753868103, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 223.9375, + "completions/mean_terminated_length": 223.9375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.07933941563690958, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.01898651325609535, + "learning_rate": 6.628e-06, + "loss": 0.0008, + "num_tokens": 16037259.0, + "reward": 3.243295431137085, + "reward_std": 0.5975750088691711, + "rewards/reward_fn/mean": 3.243295431137085, + "rewards/reward_fn/std": 0.5975750088691711, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 216.84375, + "completions/mean_terminated_length": 216.84375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.07945490241367363, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.012035488849505782, + "learning_rate": 6.626e-06, + "loss": 0.0005, + "num_tokens": 16057126.0, + "reward": 3.793842077255249, + "reward_std": 0.6512911915779114, + "rewards/reward_fn/mean": 3.793842077255249, + "rewards/reward_fn/std": 0.6512911319732666, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 81.0625, + "completions/mean_terminated_length": 81.0625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.0795703891904377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.008183398054825375, + "learning_rate": 6.624e-06, + "loss": 0.0003, + "num_tokens": 16077000.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 136.96875, + "completions/mean_terminated_length": 136.96875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.07968587596720175, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.375, + "kl": 0.014247542756493203, + "learning_rate": 6.6219999999999994e-06, + "loss": 0.0006, + "num_tokens": 16103367.0, + "reward": 3.9492239952087402, + "reward_std": 0.20003284513950348, + "rewards/reward_fn/mean": 3.9492239952087402, + "rewards/reward_fn/std": 0.2000328153371811, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 77.03125, + "completions/mean_terminated_length": 77.03125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.07980136274396582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.00576420166544267, + "learning_rate": 6.62e-06, + "loss": 0.0002, + "num_tokens": 16125896.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 142.25, + "completions/mean_terminated_length": 142.25, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.07991684952072987, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.024820221937261522, + "learning_rate": 6.618e-06, + "loss": 0.001, + "num_tokens": 16147664.0, + "reward": 3.901778221130371, + "reward_std": 0.4189213514328003, + "rewards/reward_fn/mean": 3.901778221130371, + "rewards/reward_fn/std": 0.4189213812351227, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 104.4375, + "completions/mean_terminated_length": 104.4375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.08003233629749394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.020937092340318486, + "learning_rate": 6.615999999999999e-06, + "loss": 0.0008, + "num_tokens": 16169918.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 284.71875, + "completions/mean_terminated_length": 284.71875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.080147823074258, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.013990055304020643, + "learning_rate": 6.614e-06, + "loss": 0.0006, + "num_tokens": 16203253.0, + "reward": 2.757824420928955, + "reward_std": 0.20931759476661682, + "rewards/reward_fn/mean": 2.757824420928955, + "rewards/reward_fn/std": 0.20931756496429443, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 206.375, + "completions/mean_terminated_length": 206.375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.08026330985102206, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.021316090889740735, + "learning_rate": 6.6119999999999995e-06, + "loss": 0.0009, + "num_tokens": 16232289.0, + "reward": 3.813290596008301, + "reward_std": 0.4659895598888397, + "rewards/reward_fn/mean": 3.813290596008301, + "rewards/reward_fn/std": 0.4659895598888397, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 165.3125, + "completions/mean_terminated_length": 165.3125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.08037879662778612, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.008363091706996784, + "learning_rate": 6.61e-06, + "loss": 0.0003, + "num_tokens": 16259851.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 216.125, + "completions/mean_terminated_length": 216.125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.08049428340455018, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.018744904606137425, + "learning_rate": 6.607999999999999e-06, + "loss": 0.0007, + "num_tokens": 16287247.0, + "reward": 3.9433560371398926, + "reward_std": 0.22310912609100342, + "rewards/reward_fn/mean": 3.9433560371398926, + "rewards/reward_fn/std": 0.2231091558933258, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 207.59375, + "completions/mean_terminated_length": 207.59375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.08060977018131424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046875, + "kl": 0.009865265288681258, + "learning_rate": 6.606e-06, + "loss": 0.0004, + "num_tokens": 16312482.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 194.375, + "completions/mean_terminated_length": 194.375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.0807252569580783, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.015040099620819092, + "learning_rate": 6.604e-06, + "loss": 0.0006, + "num_tokens": 16336718.0, + "reward": 3.1034274101257324, + "reward_std": 0.13892284035682678, + "rewards/reward_fn/mean": 3.1034274101257324, + "rewards/reward_fn/std": 0.13892287015914917, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 162.3125, + "completions/mean_terminated_length": 162.3125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.08084074373484236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.009201948560075834, + "learning_rate": 6.602e-06, + "loss": 0.0004, + "num_tokens": 16365432.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 153.5625, + "completions/mean_terminated_length": 153.5625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.08095623051160643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.026166830968577415, + "learning_rate": 6.5999999999999995e-06, + "loss": 0.001, + "num_tokens": 16392906.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 109.125, + "completions/mean_terminated_length": 109.125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.08107171728837048, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.009153520477411803, + "learning_rate": 6.597999999999999e-06, + "loss": 0.0004, + "num_tokens": 16420046.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 285.3125, + "completions/mean_terminated_length": 285.3125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.08118720406513454, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.011573645126190968, + "learning_rate": 6.596e-06, + "loss": 0.0005, + "num_tokens": 16453528.0, + "reward": 3.9236578941345215, + "reward_std": 0.30040597915649414, + "rewards/reward_fn/mean": 3.9236578941345215, + "rewards/reward_fn/std": 0.30040600895881653, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 148.9375, + "completions/mean_terminated_length": 148.9375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.0813026908418986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.015838247869396582, + "learning_rate": 6.594e-06, + "loss": 0.0006, + "num_tokens": 16470902.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 261.84375, + "completions/mean_terminated_length": 261.84375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.08141817761866266, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.01231937225384172, + "learning_rate": 6.592e-06, + "loss": 0.0005, + "num_tokens": 16495473.0, + "reward": 3.9332618713378906, + "reward_std": 0.3775279223918915, + "rewards/reward_fn/mean": 3.9332618713378906, + "rewards/reward_fn/std": 0.37752795219421387, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 333.9375, + "completions/mean_terminated_length": 333.9375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.08153366439542672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.012835842411732301, + "learning_rate": 6.59e-06, + "loss": 0.0005, + "num_tokens": 16521295.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 82.9375, + "completions/mean_terminated_length": 82.9375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.08164915117219078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.166015625, + "kl": 0.02010935707949102, + "learning_rate": 6.5879999999999994e-06, + "loss": 0.0008, + "num_tokens": 16541389.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 260.4375, + "completions/mean_terminated_length": 260.4375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.08176463794895485, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.02682579620159231, + "learning_rate": 6.586e-06, + "loss": 0.0011, + "num_tokens": 16571867.0, + "reward": 3.0690951347351074, + "reward_std": 0.10150482505559921, + "rewards/reward_fn/mean": 3.0690951347351074, + "rewards/reward_fn/std": 0.10150481760501862, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 94.34375, + "completions/mean_terminated_length": 94.34375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.0818801247257189, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.171875, + "kl": 0.016205130974412896, + "learning_rate": 6.583999999999999e-06, + "loss": 0.0006, + "num_tokens": 16584294.0, + "reward": 3.929910182952881, + "reward_std": 0.396488219499588, + "rewards/reward_fn/mean": 3.929910182952881, + "rewards/reward_fn/std": 0.3964882493019104, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 109.9375, + "completions/mean_terminated_length": 109.9375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.08199561150248297, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.015544890069577377, + "learning_rate": 6.582e-06, + "loss": 0.0006, + "num_tokens": 16600356.0, + "reward": 2.7888975143432617, + "reward_std": 0.0237407386302948, + "rewards/reward_fn/mean": 2.7888975143432617, + "rewards/reward_fn/std": 0.023740731179714203, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 402.1875, + "completions/mean_terminated_length": 402.1875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.08211109827924702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.012428536465449724, + "learning_rate": 6.58e-06, + "loss": 0.0005, + "num_tokens": 16632042.0, + "reward": 3.058995485305786, + "reward_std": 0.604813814163208, + "rewards/reward_fn/mean": 3.058995485305786, + "rewards/reward_fn/std": 0.6048138737678528, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 399.25, + "completions/mean_terminated_length": 399.25, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.08222658505601109, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.015961515207891352, + "learning_rate": 6.578e-06, + "loss": 0.0006, + "num_tokens": 16661426.0, + "reward": 3.7142727375030518, + "reward_std": 0.7681114673614502, + "rewards/reward_fn/mean": 3.7142727375030518, + "rewards/reward_fn/std": 0.768111526966095, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 82.90625, + "completions/mean_terminated_length": 82.90625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.08234207183277514, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.078125, + "kl": 0.008463799616947654, + "learning_rate": 6.575999999999999e-06, + "loss": 0.0003, + "num_tokens": 16683535.0, + "reward": 2.869783878326416, + "reward_std": 0.028266483917832375, + "rewards/reward_fn/mean": 2.869783878326416, + "rewards/reward_fn/std": 0.02826651558279991, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 379.8125, + "completions/mean_terminated_length": 379.8125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.08245755860953921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.013846950751030818, + "learning_rate": 6.573999999999999e-06, + "loss": 0.0006, + "num_tokens": 16711849.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 498.125, + "completions/mean_terminated_length": 498.125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.08257304538630326, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.015818476531421766, + "learning_rate": 6.572e-06, + "loss": 0.0006, + "num_tokens": 16743885.0, + "reward": 3.575437545776367, + "reward_std": 0.8979739546775818, + "rewards/reward_fn/mean": 3.575437545776367, + "rewards/reward_fn/std": 0.8979739546775818, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 140.90625, + "completions/mean_terminated_length": 140.90625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.08268853216306733, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.011443071940448135, + "learning_rate": 6.57e-06, + "loss": 0.0005, + "num_tokens": 16766538.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 211.125, + "completions/mean_terminated_length": 211.125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.08280401893983139, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.015821698325453326, + "learning_rate": 6.568e-06, + "loss": 0.0006, + "num_tokens": 16795854.0, + "reward": 3.483548641204834, + "reward_std": 0.4953596889972687, + "rewards/reward_fn/mean": 3.483548641204834, + "rewards/reward_fn/std": 0.4953596591949463, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 107.96875, + "completions/mean_terminated_length": 107.96875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.08291950571659545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.015371560206403956, + "learning_rate": 6.5659999999999995e-06, + "loss": 0.0006, + "num_tokens": 16812397.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 155.65625, + "completions/mean_terminated_length": 155.65625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.08303499249335951, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.015339405574195553, + "learning_rate": 6.564e-06, + "loss": 0.0006, + "num_tokens": 16829794.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 63.90625, + "completions/mean_terminated_length": 63.90625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.08315047927012358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.00677481759339571, + "learning_rate": 6.562e-06, + "loss": 0.0003, + "num_tokens": 16848255.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 58.65625, + "completions/mean_terminated_length": 58.65625, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.08326596604688763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.255859375, + "kl": 0.025315830949693918, + "learning_rate": 6.559999999999999e-06, + "loss": 0.001, + "num_tokens": 16865748.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 190.25, + "completions/mean_terminated_length": 190.25, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.0833814528236517, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.020202811603667215, + "learning_rate": 6.558e-06, + "loss": 0.0008, + "num_tokens": 16890684.0, + "reward": 3.2887320518493652, + "reward_std": 0.7526115775108337, + "rewards/reward_fn/mean": 3.2887320518493652, + "rewards/reward_fn/std": 0.7526116371154785, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 143.25, + "completions/mean_terminated_length": 143.25, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.08349693960041575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.013554160890635103, + "learning_rate": 6.5559999999999996e-06, + "loss": 0.0005, + "num_tokens": 16906884.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 289.4375, + "completions/mean_terminated_length": 289.4375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.08361242637717982, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.01372448970505502, + "learning_rate": 6.554e-06, + "loss": 0.0005, + "num_tokens": 16939026.0, + "reward": 3.345142364501953, + "reward_std": 0.5052719712257385, + "rewards/reward_fn/mean": 3.345142364501953, + "rewards/reward_fn/std": 0.5052719712257385, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 386.09375, + "completions/mean_terminated_length": 386.09375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.08372791315394387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040771484375, + "kl": 0.010779587464639917, + "learning_rate": 6.551999999999999e-06, + "loss": 0.0004, + "num_tokens": 16963349.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 277.0, + "completions/mean_terminated_length": 219.87095642089844, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.08384339993070794, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.016422645319835283, + "learning_rate": 6.549999999999999e-06, + "loss": 0.0007, + "num_tokens": 16990037.0, + "reward": 3.7798686027526855, + "reward_std": 0.8076668381690979, + "rewards/reward_fn/mean": 3.7798686027526855, + "rewards/reward_fn/std": 0.8076668381690979, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 48.3125, + "completions/mean_terminated_length": 48.3125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.083958886707472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.010986561530444305, + "learning_rate": 6.548e-06, + "loss": 0.0004, + "num_tokens": 17010559.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 106.90625, + "completions/mean_terminated_length": 106.90625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.08407437348423606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.020253256254363805, + "learning_rate": 6.546e-06, + "loss": 0.0008, + "num_tokens": 17025724.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 111.375, + "completions/mean_terminated_length": 111.375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.08418986026100012, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.017321078979875892, + "learning_rate": 6.5439999999999995e-06, + "loss": 0.0007, + "num_tokens": 17053384.0, + "reward": 3.9613254070281982, + "reward_std": 0.2187768816947937, + "rewards/reward_fn/mean": 3.9613254070281982, + "rewards/reward_fn/std": 0.2187769114971161, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 136.65625, + "completions/mean_terminated_length": 136.65625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.08430534703776417, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.02140157695976086, + "learning_rate": 6.541999999999999e-06, + "loss": 0.0009, + "num_tokens": 17077309.0, + "reward": 3.159794807434082, + "reward_std": 0.14663642644882202, + "rewards/reward_fn/mean": 3.159794807434082, + "rewards/reward_fn/std": 0.14663638174533844, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 193.625, + "completions/mean_terminated_length": 193.625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.08442083381452824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.010687121663067956, + "learning_rate": 6.54e-06, + "loss": 0.0004, + "num_tokens": 17091537.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 85.5625, + "completions/mean_terminated_length": 85.5625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.08453632059129229, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.012194456699944567, + "learning_rate": 6.538e-06, + "loss": 0.0005, + "num_tokens": 17119683.0, + "reward": 3.960900068283081, + "reward_std": 0.22118225693702698, + "rewards/reward_fn/mean": 3.960900068283081, + "rewards/reward_fn/std": 0.22118224203586578, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 110.53125, + "completions/mean_terminated_length": 110.53125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.08465180736805636, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051025390625, + "kl": 0.00726528847735608, + "learning_rate": 6.535999999999999e-06, + "loss": 0.0003, + "num_tokens": 17151956.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 154.9375, + "completions/mean_terminated_length": 154.9375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.08476729414482041, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.012261121868505143, + "learning_rate": 6.534e-06, + "loss": 0.0005, + "num_tokens": 17173010.0, + "reward": 3.975670337677002, + "reward_std": 0.13762937486171722, + "rewards/reward_fn/mean": 3.975670337677002, + "rewards/reward_fn/std": 0.1376294046640396, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 151.4375, + "completions/mean_terminated_length": 151.4375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.08488278092158448, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.026123055489733815, + "learning_rate": 6.5319999999999995e-06, + "loss": 0.001, + "num_tokens": 17194976.0, + "reward": 3.9565718173980713, + "reward_std": 0.17424504458904266, + "rewards/reward_fn/mean": 3.9565718173980713, + "rewards/reward_fn/std": 0.17424502968788147, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 101.34375, + "completions/mean_terminated_length": 101.34375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.08499826769834853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.010626475268509239, + "learning_rate": 6.53e-06, + "loss": 0.0004, + "num_tokens": 17219595.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 203.59375, + "completions/mean_terminated_length": 203.59375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.0851137544751126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060791015625, + "kl": 0.012696847901679575, + "learning_rate": 6.527999999999999e-06, + "loss": 0.0005, + "num_tokens": 17244254.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 368.21875, + "completions/mean_terminated_length": 368.21875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.08522924125187666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.010898931577685289, + "learning_rate": 6.526e-06, + "loss": 0.0004, + "num_tokens": 17271813.0, + "reward": 3.925661325454712, + "reward_std": 0.42052266001701355, + "rewards/reward_fn/mean": 3.925661325454712, + "rewards/reward_fn/std": 0.4205226004123688, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 224.6875, + "completions/mean_terminated_length": 224.6875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.08534472802864072, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.01514327142649563, + "learning_rate": 6.524e-06, + "loss": 0.0006, + "num_tokens": 17308443.0, + "reward": 3.549839973449707, + "reward_std": 0.4974403381347656, + "rewards/reward_fn/mean": 3.549839973449707, + "rewards/reward_fn/std": 0.49744030833244324, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 131.5625, + "completions/mean_terminated_length": 131.5625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.08546021480540478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.013247988405055366, + "learning_rate": 6.5219999999999996e-06, + "loss": 0.0005, + "num_tokens": 17331949.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 184.3125, + "completions/mean_terminated_length": 184.3125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.08557570158216884, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.013781963876681402, + "learning_rate": 6.519999999999999e-06, + "loss": 0.0006, + "num_tokens": 17348727.0, + "reward": 3.615996837615967, + "reward_std": 0.5397602319717407, + "rewards/reward_fn/mean": 3.615996837615967, + "rewards/reward_fn/std": 0.539760172367096, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1141.0, + "completions/max_terminated_length": 1141.0, + "completions/mean_length": 384.5625, + "completions/mean_terminated_length": 384.5625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.0856911883589329, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.012827640617615543, + "learning_rate": 6.517999999999999e-06, + "loss": 0.0005, + "num_tokens": 17372713.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 90.0625, + "completions/mean_terminated_length": 90.0625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.08580667513569697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1669921875, + "kl": 0.0194699231069535, + "learning_rate": 6.516e-06, + "loss": 0.0008, + "num_tokens": 17394091.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 246.46875, + "completions/mean_terminated_length": 246.46875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.08592216191246102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.015764574862259906, + "learning_rate": 6.514e-06, + "loss": 0.0006, + "num_tokens": 17419386.0, + "reward": 3.9728870391845703, + "reward_std": 0.1533740609884262, + "rewards/reward_fn/mean": 3.9728870391845703, + "rewards/reward_fn/std": 0.1533740609884262, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 91.75, + "completions/mean_terminated_length": 91.75, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.08603764868922509, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1142578125, + "kl": 0.01445609411894111, + "learning_rate": 6.512e-06, + "loss": 0.0006, + "num_tokens": 17439602.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 307.71875, + "completions/mean_terminated_length": 307.71875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.08615313546598914, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.01316539736581035, + "learning_rate": 6.5099999999999995e-06, + "loss": 0.0005, + "num_tokens": 17457609.0, + "reward": 3.297980785369873, + "reward_std": 0.6291496753692627, + "rewards/reward_fn/mean": 3.297980785369873, + "rewards/reward_fn/std": 0.6291496753692627, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 256.3125, + "completions/mean_terminated_length": 256.3125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.08626862224275321, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.018829807231668383, + "learning_rate": 6.507999999999999e-06, + "loss": 0.0008, + "num_tokens": 17479699.0, + "reward": 3.6126513481140137, + "reward_std": 0.3957713544368744, + "rewards/reward_fn/mean": 3.6126513481140137, + "rewards/reward_fn/std": 0.3957712948322296, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 90.03125, + "completions/mean_terminated_length": 90.03125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.08638410901951726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.014519402218866162, + "learning_rate": 6.506e-06, + "loss": 0.0006, + "num_tokens": 17501556.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 172.28125, + "completions/mean_terminated_length": 172.28125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.08649959579628133, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.019807668679277413, + "learning_rate": 6.503999999999999e-06, + "loss": 0.0008, + "num_tokens": 17530205.0, + "reward": 3.899075984954834, + "reward_std": 0.42614004015922546, + "rewards/reward_fn/mean": 3.899075984954834, + "rewards/reward_fn/std": 0.42614004015922546, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 334.46875, + "completions/mean_terminated_length": 334.46875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.08661508257304538, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.011897420859895647, + "learning_rate": 6.502e-06, + "loss": 0.0005, + "num_tokens": 17555916.0, + "reward": 3.862682580947876, + "reward_std": 0.540401041507721, + "rewards/reward_fn/mean": 3.862682580947876, + "rewards/reward_fn/std": 0.5404009819030762, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 54.75, + "completions/mean_terminated_length": 54.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.08673056934980945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.01391594472079305, + "learning_rate": 6.5e-06, + "loss": 0.0006, + "num_tokens": 17578820.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 199.90625, + "completions/mean_terminated_length": 199.90625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.0868460561265735, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.01343564256967511, + "learning_rate": 6.498e-06, + "loss": 0.0005, + "num_tokens": 17604513.0, + "reward": 3.779747486114502, + "reward_std": 0.49151667952537537, + "rewards/reward_fn/mean": 3.779747486114502, + "rewards/reward_fn/std": 0.4915165901184082, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 417.875, + "completions/mean_terminated_length": 417.875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.08696154290333757, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.01391524655628018, + "learning_rate": 6.496e-06, + "loss": 0.0006, + "num_tokens": 17630301.0, + "reward": 3.7888779640197754, + "reward_std": 0.667057991027832, + "rewards/reward_fn/mean": 3.7888779640197754, + "rewards/reward_fn/std": 0.667057991027832, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 101.28125, + "completions/mean_terminated_length": 101.28125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.08707702968010163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.01607472461182624, + "learning_rate": 6.493999999999999e-06, + "loss": 0.0006, + "num_tokens": 17659494.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 150.5, + "completions/mean_terminated_length": 150.5, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.0871925164568657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.01531847461592406, + "learning_rate": 6.492e-06, + "loss": 0.0006, + "num_tokens": 17676246.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 76.84375, + "completions/mean_terminated_length": 76.84375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.08730800323362975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.012666107424593065, + "learning_rate": 6.49e-06, + "loss": 0.0005, + "num_tokens": 17691377.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 276.34375, + "completions/mean_terminated_length": 276.34375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.0874234900103938, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.01478759097517468, + "learning_rate": 6.488e-06, + "loss": 0.0006, + "num_tokens": 17722716.0, + "reward": 3.601099967956543, + "reward_std": 0.46946221590042114, + "rewards/reward_fn/mean": 3.601099967956543, + "rewards/reward_fn/std": 0.46946224570274353, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 147.75, + "completions/mean_terminated_length": 147.75, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.08753897678715787, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.703125, + "kl": 0.018179918581154197, + "learning_rate": 6.485999999999999e-06, + "loss": 0.0007, + "num_tokens": 17749204.0, + "reward": 3.915821075439453, + "reward_std": 0.26825594902038574, + "rewards/reward_fn/mean": 3.915821075439453, + "rewards/reward_fn/std": 0.26825594902038574, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 116.46875, + "completions/mean_terminated_length": 116.46875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.08765446356392192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.010327880758268293, + "learning_rate": 6.484e-06, + "loss": 0.0004, + "num_tokens": 17777443.0, + "reward": 3.9769697189331055, + "reward_std": 0.13027827441692352, + "rewards/reward_fn/mean": 3.9769697189331055, + "rewards/reward_fn/std": 0.13027827441692352, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 182.59375, + "completions/mean_terminated_length": 182.59375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.08776995034068599, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.01235117252508644, + "learning_rate": 6.482e-06, + "loss": 0.0005, + "num_tokens": 17802902.0, + "reward": 3.0161538124084473, + "reward_std": 0.224690243601799, + "rewards/reward_fn/mean": 3.0161538124084473, + "rewards/reward_fn/std": 0.2246902883052826, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 94.0, + "completions/max_terminated_length": 94.0, + "completions/mean_length": 56.5625, + "completions/mean_terminated_length": 56.5625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.08788543711745005, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.828125, + "kl": 0.008957571591963642, + "learning_rate": 6.48e-06, + "loss": 0.0004, + "num_tokens": 17827784.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 85.28125, + "completions/mean_terminated_length": 85.28125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.08800092389421411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.009310553678005817, + "learning_rate": 6.478e-06, + "loss": 0.0004, + "num_tokens": 17842673.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 78.0625, + "completions/mean_terminated_length": 78.0625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.08811641067097817, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.015625, + "kl": 0.009418475796337589, + "learning_rate": 6.4759999999999995e-06, + "loss": 0.0004, + "num_tokens": 17860915.0, + "reward": 3.1842989921569824, + "reward_std": 0.08834544569253922, + "rewards/reward_fn/mean": 3.1842989921569824, + "rewards/reward_fn/std": 0.08834543824195862, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 380.21875, + "completions/mean_terminated_length": 380.21875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.08823189744774224, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.010985946384607814, + "learning_rate": 6.474e-06, + "loss": 0.0004, + "num_tokens": 17884986.0, + "reward": 3.9286582469940186, + "reward_std": 0.4035702347755432, + "rewards/reward_fn/mean": 3.9286582469940186, + "rewards/reward_fn/std": 0.4035702645778656, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 296.78125, + "completions/mean_terminated_length": 296.78125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.08834738422450629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.017023910622810945, + "learning_rate": 6.472e-06, + "loss": 0.0007, + "num_tokens": 17906355.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 171.59375, + "completions/mean_terminated_length": 171.59375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.08846287100127036, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.019266067378339358, + "learning_rate": 6.469999999999999e-06, + "loss": 0.0008, + "num_tokens": 17933126.0, + "reward": 3.8966286182403564, + "reward_std": 0.3277394771575928, + "rewards/reward_fn/mean": 3.8966286182403564, + "rewards/reward_fn/std": 0.3277394771575928, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 96.59375, + "completions/mean_terminated_length": 96.59375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.08857835777803441, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.015831228061870206, + "learning_rate": 6.468e-06, + "loss": 0.0006, + "num_tokens": 17962073.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 240.3125, + "completions/mean_terminated_length": 240.3125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.08869384455479848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.010155490337638184, + "learning_rate": 6.466e-06, + "loss": 0.0004, + "num_tokens": 17986019.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 128.0625, + "completions/mean_terminated_length": 66.1290283203125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.08880933133156253, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.70703125, + "kl": 0.011328980199323269, + "learning_rate": 6.464e-06, + "loss": 0.0005, + "num_tokens": 18004613.0, + "reward": 3.933239459991455, + "reward_std": 0.3776543438434601, + "rewards/reward_fn/mean": 3.933239459991455, + "rewards/reward_fn/std": 0.37765440344810486, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 881.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 336.15625, + "completions/mean_terminated_length": 336.15625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.0889248181083266, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.015827242998057045, + "learning_rate": 6.461999999999999e-06, + "loss": 0.0006, + "num_tokens": 18038026.0, + "reward": 3.059429883956909, + "reward_std": 0.26086851954460144, + "rewards/reward_fn/mean": 3.059429883956909, + "rewards/reward_fn/std": 0.26086854934692383, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 86.0, + "completions/max_terminated_length": 86.0, + "completions/mean_length": 54.84375, + "completions/mean_terminated_length": 54.84375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.08904030488509065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.00968765276047634, + "learning_rate": 6.46e-06, + "loss": 0.0004, + "num_tokens": 18050853.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 110.3125, + "completions/mean_terminated_length": 110.3125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.08915579166185472, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.009062628534593387, + "learning_rate": 6.458e-06, + "loss": 0.0004, + "num_tokens": 18077935.0, + "reward": 3.93241024017334, + "reward_std": 0.38234513998031616, + "rewards/reward_fn/mean": 3.93241024017334, + "rewards/reward_fn/std": 0.38234516978263855, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 77.625, + "completions/mean_terminated_length": 77.625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.08927127843861878, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.01402484267600812, + "learning_rate": 6.4560000000000005e-06, + "loss": 0.0006, + "num_tokens": 18110307.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1377.0, + "completions/max_terminated_length": 1377.0, + "completions/mean_length": 408.6875, + "completions/mean_terminated_length": 408.6875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.08938676521538284, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.014567188933142461, + "learning_rate": 6.4539999999999995e-06, + "loss": 0.0006, + "num_tokens": 18135993.0, + "reward": 3.530622720718384, + "reward_std": 1.1553142070770264, + "rewards/reward_fn/mean": 3.530622720718384, + "rewards/reward_fn/std": 1.1553142070770264, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 167.03125, + "completions/mean_terminated_length": 167.03125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.0895022519921469, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.0208790720498655, + "learning_rate": 6.451999999999999e-06, + "loss": 0.0008, + "num_tokens": 18162138.0, + "reward": 3.953766345977783, + "reward_std": 0.1823812872171402, + "rewards/reward_fn/mean": 3.953766345977783, + "rewards/reward_fn/std": 0.182381272315979, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 227.125, + "completions/mean_terminated_length": 227.125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.08961773876891097, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.01796996293705888, + "learning_rate": 6.45e-06, + "loss": 0.0007, + "num_tokens": 18193566.0, + "reward": 3.928284168243408, + "reward_std": 0.40568602085113525, + "rewards/reward_fn/mean": 3.928284168243408, + "rewards/reward_fn/std": 0.40568599104881287, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 136.71875, + "completions/mean_terminated_length": 136.71875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.08973322554567502, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.875, + "kl": 0.019005796202691272, + "learning_rate": 6.448e-06, + "loss": 0.0008, + "num_tokens": 18216917.0, + "reward": 2.8551552295684814, + "reward_std": 0.04884541407227516, + "rewards/reward_fn/mean": 2.8551552295684814, + "rewards/reward_fn/std": 0.048845428973436356, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 162.34375, + "completions/mean_terminated_length": 162.34375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.08984871232243909, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.013407740356342401, + "learning_rate": 6.446e-06, + "loss": 0.0005, + "num_tokens": 18237408.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 57.09375, + "completions/mean_terminated_length": 57.09375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.08996419909920314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.0067436882891342975, + "learning_rate": 6.444e-06, + "loss": 0.0003, + "num_tokens": 18261443.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 178.6875, + "completions/mean_terminated_length": 178.6875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.09007968587596721, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.01418575741990935, + "learning_rate": 6.4419999999999995e-06, + "loss": 0.0006, + "num_tokens": 18283065.0, + "reward": 3.9702491760253906, + "reward_std": 0.16829590499401093, + "rewards/reward_fn/mean": 3.9702491760253906, + "rewards/reward_fn/std": 0.16829586029052734, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 399.59375, + "completions/mean_terminated_length": 399.59375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.09019517265273126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04443359375, + "kl": 0.011312860151519999, + "learning_rate": 6.44e-06, + "loss": 0.0005, + "num_tokens": 18305644.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 209.09375, + "completions/mean_terminated_length": 209.09375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.09031065942949533, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.01004182496399153, + "learning_rate": 6.437999999999999e-06, + "loss": 0.0004, + "num_tokens": 18334479.0, + "reward": 3.058117628097534, + "reward_std": 0.36548569798469543, + "rewards/reward_fn/mean": 3.058117628097534, + "rewards/reward_fn/std": 0.36548569798469543, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 145.90625, + "completions/mean_terminated_length": 145.90625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.09042614620625938, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.703125, + "kl": 0.014148225469398312, + "learning_rate": 6.436e-06, + "loss": 0.0006, + "num_tokens": 18357068.0, + "reward": 3.7914962768554688, + "reward_std": 0.6586650609970093, + "rewards/reward_fn/mean": 3.7914962768554688, + "rewards/reward_fn/std": 0.6586650609970093, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 162.6875, + "completions/mean_terminated_length": 162.6875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.09054163298302344, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.011830383067717776, + "learning_rate": 6.434e-06, + "loss": 0.0005, + "num_tokens": 18381442.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.0906571197597875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.014614566796808504, + "learning_rate": 6.432e-06, + "loss": 0.0006, + "num_tokens": 18400664.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 99.75, + "completions/mean_terminated_length": 99.75, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.09077260653655156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.011176039377460256, + "learning_rate": 6.429999999999999e-06, + "loss": 0.0004, + "num_tokens": 18424560.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 227.03125, + "completions/mean_terminated_length": 227.03125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.09088809331331563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.01642650338180829, + "learning_rate": 6.427999999999999e-06, + "loss": 0.0007, + "num_tokens": 18453809.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 174.375, + "completions/mean_terminated_length": 174.375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.09100358009007968, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.014707653870573267, + "learning_rate": 6.426e-06, + "loss": 0.0006, + "num_tokens": 18471261.0, + "reward": 3.9291164875030518, + "reward_std": 0.4009776711463928, + "rewards/reward_fn/mean": 3.9291164875030518, + "rewards/reward_fn/std": 0.40097764134407043, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 224.65625, + "completions/mean_terminated_length": 224.65625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.09111906686684375, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.015603951425873674, + "learning_rate": 6.424e-06, + "loss": 0.0006, + "num_tokens": 18489330.0, + "reward": 3.51389217376709, + "reward_std": 0.6518033146858215, + "rewards/reward_fn/mean": 3.51389217376709, + "rewards/reward_fn/std": 0.6518033146858215, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 127.1875, + "completions/mean_terminated_length": 127.1875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.0912345536436078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.013247496397525538, + "learning_rate": 6.422e-06, + "loss": 0.0005, + "num_tokens": 18518840.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 191.6875, + "completions/mean_terminated_length": 191.6875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.09135004042037187, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.014100037646130659, + "learning_rate": 6.4199999999999995e-06, + "loss": 0.0006, + "num_tokens": 18540910.0, + "reward": 3.966266393661499, + "reward_std": 0.1908264458179474, + "rewards/reward_fn/mean": 3.966266393661499, + "rewards/reward_fn/std": 0.19082647562026978, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 124.34375, + "completions/mean_terminated_length": 124.34375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.09146552719713592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.007822913335985504, + "learning_rate": 6.418e-06, + "loss": 0.0003, + "num_tokens": 18566937.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 162.0625, + "completions/mean_terminated_length": 162.0625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.09158101397389999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.030320253950776532, + "learning_rate": 6.416e-06, + "loss": 0.0012, + "num_tokens": 18584379.0, + "reward": 3.8225350379943848, + "reward_std": 0.3566572368144989, + "rewards/reward_fn/mean": 3.8225350379943848, + "rewards/reward_fn/std": 0.3566572666168213, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 247.3125, + "completions/mean_terminated_length": 247.3125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.09169650075066405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.016466942804981954, + "learning_rate": 6.413999999999999e-06, + "loss": 0.0007, + "num_tokens": 18608325.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 53.21875, + "completions/mean_terminated_length": 53.21875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.09181198752742811, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2216796875, + "kl": 0.014468215245869942, + "learning_rate": 6.412e-06, + "loss": 0.0006, + "num_tokens": 18626316.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 167.5, + "completions/mean_terminated_length": 167.5, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.09192747430419217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.017303696418821346, + "learning_rate": 6.41e-06, + "loss": 0.0007, + "num_tokens": 18656412.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 68.09375, + "completions/mean_terminated_length": 68.09375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.09204296108095623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.166015625, + "kl": 0.014858884584100451, + "learning_rate": 6.408e-06, + "loss": 0.0006, + "num_tokens": 18672639.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 200.375, + "completions/mean_terminated_length": 200.375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.09215844785772029, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.012542223674245179, + "learning_rate": 6.405999999999999e-06, + "loss": 0.0005, + "num_tokens": 18700523.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 201.21875, + "completions/mean_terminated_length": 201.21875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.09227393463448436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.01368978362006601, + "learning_rate": 6.404e-06, + "loss": 0.0005, + "num_tokens": 18733906.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 219.25, + "completions/mean_terminated_length": 219.25, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.09238942141124841, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.012305409167311154, + "learning_rate": 6.402e-06, + "loss": 0.0005, + "num_tokens": 18752154.0, + "reward": 3.311436653137207, + "reward_std": 1.037879228591919, + "rewards/reward_fn/mean": 3.311436653137207, + "rewards/reward_fn/std": 1.037879228591919, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 220.40625, + "completions/mean_terminated_length": 220.40625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.09250490818801248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.011240910273045301, + "learning_rate": 6.4e-06, + "loss": 0.0004, + "num_tokens": 18780551.0, + "reward": 3.92850399017334, + "reward_std": 0.19271309673786163, + "rewards/reward_fn/mean": 3.92850399017334, + "rewards/reward_fn/std": 0.19271308183670044, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 130.4375, + "completions/mean_terminated_length": 130.4375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.09262039496477653, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.013922671147156507, + "learning_rate": 6.3979999999999996e-06, + "loss": 0.0006, + "num_tokens": 18801525.0, + "reward": 3.964094877243042, + "reward_std": 0.20311006903648376, + "rewards/reward_fn/mean": 3.964094877243042, + "rewards/reward_fn/std": 0.20311008393764496, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 196.9375, + "completions/mean_terminated_length": 196.9375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.0927358817415406, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.03081478227977641, + "learning_rate": 6.395999999999999e-06, + "loss": 0.0012, + "num_tokens": 18832723.0, + "reward": 3.801238536834717, + "reward_std": 0.3309423327445984, + "rewards/reward_fn/mean": 3.801238536834717, + "rewards/reward_fn/std": 0.3309422731399536, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 169.375, + "completions/mean_terminated_length": 169.375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.09285136851830465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.01801215100567788, + "learning_rate": 6.394e-06, + "loss": 0.0007, + "num_tokens": 18861663.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 274.65625, + "completions/mean_terminated_length": 274.65625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.09296685529506872, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.012154736017691903, + "learning_rate": 6.392e-06, + "loss": 0.0005, + "num_tokens": 18885428.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 329.1875, + "completions/mean_terminated_length": 329.1875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.09308234207183277, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.013658771480550058, + "learning_rate": 6.39e-06, + "loss": 0.0005, + "num_tokens": 18907770.0, + "reward": 3.927464485168457, + "reward_std": 0.41032281517982483, + "rewards/reward_fn/mean": 3.927464485168457, + "rewards/reward_fn/std": 0.41032278537750244, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 227.34375, + "completions/mean_terminated_length": 227.34375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.09319782884859684, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.014697650869493373, + "learning_rate": 6.388e-06, + "loss": 0.0006, + "num_tokens": 18934245.0, + "reward": 3.79226016998291, + "reward_std": 0.40370529890060425, + "rewards/reward_fn/mean": 3.79226016998291, + "rewards/reward_fn/std": 0.40370526909828186, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 150.65625, + "completions/mean_terminated_length": 150.65625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.0933133156253609, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.011520944884978235, + "learning_rate": 6.3859999999999995e-06, + "loss": 0.0005, + "num_tokens": 18955098.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 182.9375, + "completions/mean_terminated_length": 182.9375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.09342880240212496, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.025381776387803257, + "learning_rate": 6.384e-06, + "loss": 0.001, + "num_tokens": 18985144.0, + "reward": 3.2019689083099365, + "reward_std": 0.16371193528175354, + "rewards/reward_fn/mean": 3.2019689083099365, + "rewards/reward_fn/std": 0.16371190547943115, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 82.21875, + "completions/mean_terminated_length": 82.21875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.09354428917888902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.009150829124337179, + "learning_rate": 6.381999999999999e-06, + "loss": 0.0004, + "num_tokens": 19008543.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 396.25, + "completions/mean_terminated_length": 396.25, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.09365977595565307, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.010796227856189944, + "learning_rate": 6.38e-06, + "loss": 0.0004, + "num_tokens": 19037927.0, + "reward": 3.7936949729919434, + "reward_std": 0.5335686802864075, + "rewards/reward_fn/mean": 3.7936949729919434, + "rewards/reward_fn/std": 0.5335686206817627, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 186.03125, + "completions/mean_terminated_length": 186.03125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.09377526273241714, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.014643639005953446, + "learning_rate": 6.378e-06, + "loss": 0.0006, + "num_tokens": 19067912.0, + "reward": 3.903519630432129, + "reward_std": 0.30479004979133606, + "rewards/reward_fn/mean": 3.903519630432129, + "rewards/reward_fn/std": 0.30479007959365845, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 81.15625, + "completions/mean_terminated_length": 81.15625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.09389074950918119, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.012466374108043965, + "learning_rate": 6.3760000000000004e-06, + "loss": 0.0005, + "num_tokens": 19088173.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 77.375, + "completions/mean_terminated_length": 77.375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.09400623628594526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.169921875, + "kl": 0.019125153296045028, + "learning_rate": 6.3739999999999995e-06, + "loss": 0.0008, + "num_tokens": 19113977.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 56.34375, + "completions/mean_terminated_length": 56.34375, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.09412172306270931, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.59375, + "kl": 0.011532810429343954, + "learning_rate": 6.371999999999999e-06, + "loss": 0.0005, + "num_tokens": 19139652.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 103.0, + "completions/max_terminated_length": 103.0, + "completions/mean_length": 78.9375, + "completions/mean_terminated_length": 78.9375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.09423720983947338, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.008130366033583414, + "learning_rate": 6.37e-06, + "loss": 0.0003, + "num_tokens": 19151746.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 99.3125, + "completions/mean_terminated_length": 99.3125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.09435269661623744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1220703125, + "kl": 0.01596155333390925, + "learning_rate": 6.368e-06, + "loss": 0.0006, + "num_tokens": 19172780.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 996.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 494.96875, + "completions/mean_terminated_length": 494.96875, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.0944681833930015, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.013329807043191977, + "learning_rate": 6.366e-06, + "loss": 0.0005, + "num_tokens": 19203787.0, + "reward": 3.5469470024108887, + "reward_std": 0.8941089510917664, + "rewards/reward_fn/mean": 3.5469470024108887, + "rewards/reward_fn/std": 0.8941090106964111, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 116.46875, + "completions/mean_terminated_length": 116.46875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.09458367016976556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.010731410766311456, + "learning_rate": 6.3639999999999995e-06, + "loss": 0.0004, + "num_tokens": 19221658.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 209.5625, + "completions/mean_terminated_length": 209.5625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.09469915694652963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.01561382292129565, + "learning_rate": 6.361999999999999e-06, + "loss": 0.0006, + "num_tokens": 19240780.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 104.4375, + "completions/mean_terminated_length": 104.4375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.09481464372329368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.019518994260579348, + "learning_rate": 6.36e-06, + "loss": 0.0008, + "num_tokens": 19269594.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 216.125, + "completions/mean_terminated_length": 216.125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.09493013050005775, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.01482544039026834, + "learning_rate": 6.357999999999999e-06, + "loss": 0.0006, + "num_tokens": 19299102.0, + "reward": 3.9319405555725098, + "reward_std": 0.38500162959098816, + "rewards/reward_fn/mean": 3.9319405555725098, + "rewards/reward_fn/std": 0.38500159978866577, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 497.59375, + "completions/mean_terminated_length": 497.59375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.0950456172768218, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.01458668225677684, + "learning_rate": 6.356e-06, + "loss": 0.0006, + "num_tokens": 19335601.0, + "reward": 2.5710806846618652, + "reward_std": 0.6639242172241211, + "rewards/reward_fn/mean": 2.5710806846618652, + "rewards/reward_fn/std": 0.6639242172241211, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 150.90625, + "completions/mean_terminated_length": 150.90625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.09516110405358587, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.019857354753185064, + "learning_rate": 6.354e-06, + "loss": 0.0008, + "num_tokens": 19364462.0, + "reward": 3.7279295921325684, + "reward_std": 0.48047760128974915, + "rewards/reward_fn/mean": 3.7279295921325684, + "rewards/reward_fn/std": 0.48047763109207153, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 70.09375, + "completions/mean_terminated_length": 70.09375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.09527659083034992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.007507075621106196, + "learning_rate": 6.352e-06, + "loss": 0.0003, + "num_tokens": 19378481.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 139.125, + "completions/mean_terminated_length": 139.125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.09539207760711399, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.016080227127531543, + "learning_rate": 6.349999999999999e-06, + "loss": 0.0006, + "num_tokens": 19399029.0, + "reward": 3.431710720062256, + "reward_std": 0.4888460636138916, + "rewards/reward_fn/mean": 3.431710720062256, + "rewards/reward_fn/std": 0.4888461232185364, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 221.5, + "completions/mean_terminated_length": 221.5, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.09550756438387804, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.022121511516161263, + "learning_rate": 6.347999999999999e-06, + "loss": 0.0009, + "num_tokens": 19425829.0, + "reward": 2.9790596961975098, + "reward_std": 0.23674193024635315, + "rewards/reward_fn/mean": 2.9790596961975098, + "rewards/reward_fn/std": 0.23674198985099792, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 279.375, + "completions/mean_terminated_length": 279.375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.09562305116064211, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.012671977514401078, + "learning_rate": 6.346e-06, + "loss": 0.0005, + "num_tokens": 19449809.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 105.15625, + "completions/mean_terminated_length": 105.15625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.09573853793740617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.01286827250442002, + "learning_rate": 6.344e-06, + "loss": 0.0005, + "num_tokens": 19469654.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 200.28125, + "completions/mean_terminated_length": 200.28125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.09585402471417023, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.018060587943182327, + "learning_rate": 6.342e-06, + "loss": 0.0007, + "num_tokens": 19495263.0, + "reward": 3.7826790809631348, + "reward_std": 0.5737784504890442, + "rewards/reward_fn/mean": 3.7826790809631348, + "rewards/reward_fn/std": 0.5737784504890442, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 252.625, + "completions/mean_terminated_length": 252.625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.09596951149093429, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.012398935781675391, + "learning_rate": 6.3399999999999994e-06, + "loss": 0.0005, + "num_tokens": 19514451.0, + "reward": 3.9302711486816406, + "reward_std": 0.3944462835788727, + "rewards/reward_fn/mean": 3.9302711486816406, + "rewards/reward_fn/std": 0.3944462835788727, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 185.6875, + "completions/mean_terminated_length": 185.6875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.09608499826769835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0546875, + "kl": 0.011429379810579121, + "learning_rate": 6.338e-06, + "loss": 0.0005, + "num_tokens": 19532457.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 338.375, + "completions/mean_terminated_length": 338.375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.09620048504446241, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.013261303072795272, + "learning_rate": 6.336e-06, + "loss": 0.0005, + "num_tokens": 19569333.0, + "reward": 3.2161712646484375, + "reward_std": 0.3873843550682068, + "rewards/reward_fn/mean": 3.2161712646484375, + "rewards/reward_fn/std": 0.3873843848705292, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 171.0625, + "completions/mean_terminated_length": 171.0625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.09631597182122648, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.013559857179643586, + "learning_rate": 6.333999999999999e-06, + "loss": 0.0005, + "num_tokens": 19597399.0, + "reward": 3.8474984169006348, + "reward_std": 0.46061569452285767, + "rewards/reward_fn/mean": 3.8474984169006348, + "rewards/reward_fn/std": 0.46061569452285767, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 66.46875, + "completions/mean_terminated_length": 66.46875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.09643145859799053, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1982421875, + "kl": 0.022027654224075377, + "learning_rate": 6.332e-06, + "loss": 0.0009, + "num_tokens": 19617926.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 180.40625, + "completions/mean_terminated_length": 180.40625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.0965469453747546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.01522229571128264, + "learning_rate": 6.3299999999999995e-06, + "loss": 0.0006, + "num_tokens": 19645651.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 190.75, + "completions/mean_terminated_length": 190.75, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.09666243215151865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.014081569301197305, + "learning_rate": 6.328e-06, + "loss": 0.0006, + "num_tokens": 19668779.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 87.875, + "completions/mean_terminated_length": 87.875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.0967779189282827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.012619745335541666, + "learning_rate": 6.325999999999999e-06, + "loss": 0.0005, + "num_tokens": 19693223.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 262.78125, + "completions/mean_terminated_length": 262.78125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.09689340570504677, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.013187522141379304, + "learning_rate": 6.324e-06, + "loss": 0.0005, + "num_tokens": 19717856.0, + "reward": 3.677762508392334, + "reward_std": 0.7811967730522156, + "rewards/reward_fn/mean": 3.677762508392334, + "rewards/reward_fn/std": 0.7811967730522156, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 65.375, + "completions/mean_terminated_length": 65.375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.09700889248181083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.00960457759356359, + "learning_rate": 6.322e-06, + "loss": 0.0004, + "num_tokens": 19731724.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 167.03125, + "completions/mean_terminated_length": 167.03125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.0971243792585749, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.01644325979577843, + "learning_rate": 6.32e-06, + "loss": 0.0007, + "num_tokens": 19744717.0, + "reward": 3.3475115299224854, + "reward_std": 0.09477242082357407, + "rewards/reward_fn/mean": 3.3475115299224854, + "rewards/reward_fn/std": 0.09477242827415466, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 166.78125, + "completions/mean_terminated_length": 166.78125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.09723986603533895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.015235268365358934, + "learning_rate": 6.3179999999999995e-06, + "loss": 0.0006, + "num_tokens": 19766086.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 189.375, + "completions/mean_terminated_length": 189.375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.09735535281210302, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.011340900193317793, + "learning_rate": 6.315999999999999e-06, + "loss": 0.0005, + "num_tokens": 19790770.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 185.59375, + "completions/mean_terminated_length": 185.59375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.09747083958886707, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.012231185304699466, + "learning_rate": 6.314e-06, + "loss": 0.0005, + "num_tokens": 19811429.0, + "reward": 3.929769992828369, + "reward_std": 0.3972814679145813, + "rewards/reward_fn/mean": 3.929769992828369, + "rewards/reward_fn/std": 0.3972814679145813, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 133.09375, + "completions/mean_terminated_length": 133.09375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.09758632636563114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.01456759832217358, + "learning_rate": 6.312e-06, + "loss": 0.0006, + "num_tokens": 19827176.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 429.71875, + "completions/mean_terminated_length": 429.71875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.09770181314239519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.014731207411387004, + "learning_rate": 6.31e-06, + "loss": 0.0006, + "num_tokens": 19863231.0, + "reward": 3.783034324645996, + "reward_std": 0.46154019236564636, + "rewards/reward_fn/mean": 3.783034324645996, + "rewards/reward_fn/std": 0.46154019236564636, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 132.90625, + "completions/mean_terminated_length": 132.90625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.09781729991915926, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0, + "kl": 0.017041653554770164, + "learning_rate": 6.3079999999999996e-06, + "loss": 0.0007, + "num_tokens": 19879068.0, + "reward": 3.889040470123291, + "reward_std": 0.2678828239440918, + "rewards/reward_fn/mean": 3.889040470123291, + "rewards/reward_fn/std": 0.2678828239440918, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.09793278669592331, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.011401785217458382, + "learning_rate": 6.3059999999999994e-06, + "loss": 0.0005, + "num_tokens": 19903012.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 109.96875, + "completions/mean_terminated_length": 109.96875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.09804827347268738, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.010431371589220362, + "learning_rate": 6.304e-06, + "loss": 0.0004, + "num_tokens": 19929443.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 194.71875, + "completions/mean_terminated_length": 194.71875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.09816376024945143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049072265625, + "kl": 0.010950305630103685, + "learning_rate": 6.301999999999999e-06, + "loss": 0.0004, + "num_tokens": 19947034.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 97.09375, + "completions/mean_terminated_length": 97.09375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.0982792470262155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.010193260641244706, + "learning_rate": 6.3e-06, + "loss": 0.0004, + "num_tokens": 19962557.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 304.34375, + "completions/mean_terminated_length": 304.34375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.09839473380297956, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.014624464471125975, + "learning_rate": 6.298e-06, + "loss": 0.0006, + "num_tokens": 19991752.0, + "reward": 2.879824638366699, + "reward_std": 0.060693684965372086, + "rewards/reward_fn/mean": 2.879824638366699, + "rewards/reward_fn/std": 0.0606936551630497, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 169.96875, + "completions/mean_terminated_length": 169.96875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.09851022057974362, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.01834758212498855, + "learning_rate": 6.296e-06, + "loss": 0.0007, + "num_tokens": 20010503.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 340.96875, + "completions/mean_terminated_length": 340.96875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.09862570735650768, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.015957828596583568, + "learning_rate": 6.293999999999999e-06, + "loss": 0.0006, + "num_tokens": 20032838.0, + "reward": 3.2179245948791504, + "reward_std": 1.0979598760604858, + "rewards/reward_fn/mean": 3.2179245948791504, + "rewards/reward_fn/std": 1.0979598760604858, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 156.5625, + "completions/mean_terminated_length": 156.5625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.09874119413327175, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.014793873750022613, + "learning_rate": 6.291999999999999e-06, + "loss": 0.0006, + "num_tokens": 20058680.0, + "reward": 3.7969508171081543, + "reward_std": 0.43189069628715515, + "rewards/reward_fn/mean": 3.7969508171081543, + "rewards/reward_fn/std": 0.4318907558917999, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 57.53125, + "completions/mean_terminated_length": 57.53125, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.0988566809100358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.010621336106851231, + "learning_rate": 6.29e-06, + "loss": 0.0004, + "num_tokens": 20072681.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 83.59375, + "completions/mean_terminated_length": 83.59375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.09897216768679987, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.006320531272649532, + "learning_rate": 6.288e-06, + "loss": 0.0003, + "num_tokens": 20090332.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 140.125, + "completions/mean_terminated_length": 140.125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.09908765446356392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.018125894479453564, + "learning_rate": 6.286e-06, + "loss": 0.0007, + "num_tokens": 20105632.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 109.25, + "completions/mean_terminated_length": 109.25, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.09920314124032799, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10107421875, + "kl": 0.01746356292278506, + "learning_rate": 6.2839999999999995e-06, + "loss": 0.0007, + "num_tokens": 20125928.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 95.0, + "completions/mean_terminated_length": 95.0, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.09931862801709204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.00620613189676078, + "learning_rate": 6.282e-06, + "loss": 0.0002, + "num_tokens": 20144456.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 201.90625, + "completions/mean_terminated_length": 201.90625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.09943411479385611, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.02737025049282238, + "learning_rate": 6.28e-06, + "loss": 0.0011, + "num_tokens": 20173157.0, + "reward": 3.3282299041748047, + "reward_std": 0.9698663949966431, + "rewards/reward_fn/mean": 3.3282299041748047, + "rewards/reward_fn/std": 0.9698663353919983, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1069.0, + "completions/max_terminated_length": 1069.0, + "completions/mean_length": 458.0, + "completions/mean_terminated_length": 458.0, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.09954960157062016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.014474883413640782, + "learning_rate": 6.277999999999999e-06, + "loss": 0.0006, + "num_tokens": 20199909.0, + "reward": 3.783350706100464, + "reward_std": 0.6843929290771484, + "rewards/reward_fn/mean": 3.783350706100464, + "rewards/reward_fn/std": 0.6843928694725037, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 145.65625, + "completions/mean_terminated_length": 145.65625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.09966508834738423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.016093967395136133, + "learning_rate": 6.276e-06, + "loss": 0.0006, + "num_tokens": 20225242.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 91.125, + "completions/mean_terminated_length": 91.125, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.09978057512414829, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16015625, + "kl": 0.0151550910086371, + "learning_rate": 6.2739999999999996e-06, + "loss": 0.0006, + "num_tokens": 20250686.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 168.59375, + "completions/mean_terminated_length": 168.59375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.09989606190091234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.011958709714235738, + "learning_rate": 6.272e-06, + "loss": 0.0005, + "num_tokens": 20265713.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 160.59375, + "completions/mean_terminated_length": 160.59375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.1000115486776764, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.029310211684787646, + "learning_rate": 6.269999999999999e-06, + "loss": 0.0012, + "num_tokens": 20294308.0, + "reward": 3.286797046661377, + "reward_std": 0.2193298190832138, + "rewards/reward_fn/mean": 3.286797046661377, + "rewards/reward_fn/std": 0.219329833984375, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 275.1875, + "completions/mean_terminated_length": 275.1875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.10012703545444046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.011267786176176742, + "learning_rate": 6.267999999999999e-06, + "loss": 0.0005, + "num_tokens": 20318314.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 92.15625, + "completions/mean_terminated_length": 92.15625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.10024252223120453, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1025390625, + "kl": 0.014529438383760862, + "learning_rate": 6.266e-06, + "loss": 0.0006, + "num_tokens": 20335983.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 139.03125, + "completions/mean_terminated_length": 139.03125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.10035800900796858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.013357410905882716, + "learning_rate": 6.264e-06, + "loss": 0.0005, + "num_tokens": 20352720.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.10047349578473265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.016021164366975427, + "learning_rate": 6.2619999999999995e-06, + "loss": 0.0006, + "num_tokens": 20373781.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 98.5, + "completions/mean_terminated_length": 98.5, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.1005889825614967, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.625, + "kl": 0.016920383139222395, + "learning_rate": 6.259999999999999e-06, + "loss": 0.0007, + "num_tokens": 20395429.0, + "reward": 3.3607900142669678, + "reward_std": 0.43916383385658264, + "rewards/reward_fn/mean": 3.3607900142669678, + "rewards/reward_fn/std": 0.43916383385658264, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 94.0, + "completions/max_terminated_length": 94.0, + "completions/mean_length": 73.40625, + "completions/mean_terminated_length": 73.40625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.10070446933826077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.009525153658614727, + "learning_rate": 6.258e-06, + "loss": 0.0004, + "num_tokens": 20416146.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 359.8125, + "completions/mean_terminated_length": 359.8125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.10081995611502483, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.012805348116671667, + "learning_rate": 6.256e-06, + "loss": 0.0005, + "num_tokens": 20442668.0, + "reward": 3.92881178855896, + "reward_std": 0.4027016758918762, + "rewards/reward_fn/mean": 3.92881178855896, + "rewards/reward_fn/std": 0.4027017056941986, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 131.375, + "completions/mean_terminated_length": 131.375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.10093544289178889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.01008286613796372, + "learning_rate": 6.253999999999999e-06, + "loss": 0.0004, + "num_tokens": 20459800.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 123.03125, + "completions/mean_terminated_length": 123.03125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.10105092966855295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.109375, + "kl": 0.018389060132903978, + "learning_rate": 6.252e-06, + "loss": 0.0007, + "num_tokens": 20476345.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 143.5625, + "completions/mean_terminated_length": 143.5625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.10116641644531701, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.01841755632631248, + "learning_rate": 6.2499999999999995e-06, + "loss": 0.0007, + "num_tokens": 20500363.0, + "reward": 3.185171604156494, + "reward_std": 0.6582365036010742, + "rewards/reward_fn/mean": 3.185171604156494, + "rewards/reward_fn/std": 0.6582364439964294, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 90.53125, + "completions/mean_terminated_length": 90.53125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.10128190322208107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.008989409450805397, + "learning_rate": 6.248e-06, + "loss": 0.0004, + "num_tokens": 20514652.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 242.0, + "completions/mean_terminated_length": 242.0, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.10139738999884514, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.014824352678260766, + "learning_rate": 6.246e-06, + "loss": 0.0006, + "num_tokens": 20537084.0, + "reward": 3.9239702224731445, + "reward_std": 0.2992008328437805, + "rewards/reward_fn/mean": 3.9239702224731445, + "rewards/reward_fn/std": 0.29920077323913574, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 97.84375, + "completions/mean_terminated_length": 97.84375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.10151287677560919, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.015266518268617801, + "learning_rate": 6.244e-06, + "loss": 0.0006, + "num_tokens": 20553783.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1778.0, + "completions/max_terminated_length": 1778.0, + "completions/mean_length": 613.28125, + "completions/mean_terminated_length": 613.28125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.10162836355237326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.01063481812889222, + "learning_rate": 6.242e-06, + "loss": 0.0004, + "num_tokens": 20588992.0, + "reward": 3.9230222702026367, + "reward_std": 0.4354526400566101, + "rewards/reward_fn/mean": 3.9230222702026367, + "rewards/reward_fn/std": 0.4354526400566101, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 252.8125, + "completions/mean_terminated_length": 252.8125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.10174385032913731, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150390625, + "kl": 0.009272656614484731, + "learning_rate": 6.2399999999999995e-06, + "loss": 0.0004, + "num_tokens": 20612346.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 131.0625, + "completions/mean_terminated_length": 131.0625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.10185933710590138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.010495665468624793, + "learning_rate": 6.238e-06, + "loss": 0.0004, + "num_tokens": 20627420.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 116.15625, + "completions/mean_terminated_length": 116.15625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.10197482388266543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.015763631439767778, + "learning_rate": 6.235999999999999e-06, + "loss": 0.0006, + "num_tokens": 20643201.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 109.65625, + "completions/mean_terminated_length": 109.65625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.1020903106594295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.006088869966333732, + "learning_rate": 6.234e-06, + "loss": 0.0002, + "num_tokens": 20659382.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 74.46875, + "completions/mean_terminated_length": 74.46875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.10220579743619355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.008484136065817438, + "learning_rate": 6.232e-06, + "loss": 0.0003, + "num_tokens": 20673157.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 83.71875, + "completions/mean_terminated_length": 83.71875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.10232128421295762, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.006659597842372023, + "learning_rate": 6.2300000000000005e-06, + "loss": 0.0003, + "num_tokens": 20690972.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 144.9375, + "completions/mean_terminated_length": 83.54838562011719, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.10243677098972168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.009419408524991013, + "learning_rate": 6.2279999999999995e-06, + "loss": 0.0004, + "num_tokens": 20712794.0, + "reward": 3.699533462524414, + "reward_std": 0.8917430639266968, + "rewards/reward_fn/mean": 3.699533462524414, + "rewards/reward_fn/std": 0.891743004322052, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 234.96875, + "completions/mean_terminated_length": 234.96875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.10255225776648574, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.01054363044386264, + "learning_rate": 6.225999999999999e-06, + "loss": 0.0004, + "num_tokens": 20733369.0, + "reward": 3.687453269958496, + "reward_std": 0.4743165075778961, + "rewards/reward_fn/mean": 3.687453269958496, + "rewards/reward_fn/std": 0.4743165373802185, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 89.625, + "completions/mean_terminated_length": 89.625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.1026677445432498, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.010185687518969644, + "learning_rate": 6.224e-06, + "loss": 0.0004, + "num_tokens": 20752781.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 196.34375, + "completions/mean_terminated_length": 196.34375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.10278323132001387, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.00970074518409092, + "learning_rate": 6.222e-06, + "loss": 0.0004, + "num_tokens": 20779640.0, + "reward": 3.8972327709198, + "reward_std": 0.4007548987865448, + "rewards/reward_fn/mean": 3.8972327709198, + "rewards/reward_fn/std": 0.4007548987865448, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 157.09375, + "completions/mean_terminated_length": 157.09375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.10289871809677792, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.012493143854953814, + "learning_rate": 6.22e-06, + "loss": 0.0005, + "num_tokens": 20801371.0, + "reward": 3.978933572769165, + "reward_std": 0.11916936933994293, + "rewards/reward_fn/mean": 3.978933572769165, + "rewards/reward_fn/std": 0.11916932463645935, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 213.9375, + "completions/mean_terminated_length": 213.9375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.10301420487354197, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.010983062122249976, + "learning_rate": 6.218e-06, + "loss": 0.0004, + "num_tokens": 20821721.0, + "reward": 3.9818692207336426, + "reward_std": 0.10256348550319672, + "rewards/reward_fn/mean": 3.9818692207336426, + "rewards/reward_fn/std": 0.10256347060203552, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 147.8125, + "completions/mean_terminated_length": 147.8125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.10312969165030604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.013252874603495002, + "learning_rate": 6.216e-06, + "loss": 0.0005, + "num_tokens": 20849075.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 54.15625, + "completions/mean_terminated_length": 54.15625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.1032451784270701, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.01050054614097462, + "learning_rate": 6.214e-06, + "loss": 0.0004, + "num_tokens": 20868888.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 113.25, + "completions/mean_terminated_length": 113.25, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.10336066520383416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04833984375, + "kl": 0.004971454394762986, + "learning_rate": 6.211999999999999e-06, + "loss": 0.0002, + "num_tokens": 20891456.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 170.1875, + "completions/mean_terminated_length": 170.1875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.10347615198059822, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.01841776428045705, + "learning_rate": 6.21e-06, + "loss": 0.0007, + "num_tokens": 20917990.0, + "reward": 3.4971442222595215, + "reward_std": 0.09687189012765884, + "rewards/reward_fn/mean": 3.4971442222595215, + "rewards/reward_fn/std": 0.09687186777591705, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 168.03125, + "completions/mean_terminated_length": 168.03125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.10359163875736228, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.016337761713657528, + "learning_rate": 6.208e-06, + "loss": 0.0007, + "num_tokens": 20935847.0, + "reward": 3.794252872467041, + "reward_std": 0.4901490807533264, + "rewards/reward_fn/mean": 3.794252872467041, + "rewards/reward_fn/std": 0.49014902114868164, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 176.28125, + "completions/mean_terminated_length": 176.28125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.10370712553412634, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.01674425831879489, + "learning_rate": 6.206e-06, + "loss": 0.0007, + "num_tokens": 20964752.0, + "reward": 3.9565725326538086, + "reward_std": 0.17261628806591034, + "rewards/reward_fn/mean": 3.9565725326538086, + "rewards/reward_fn/std": 0.17261630296707153, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 75.875, + "completions/mean_terminated_length": 75.875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.1038226123108904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.007264557360031176, + "learning_rate": 6.203999999999999e-06, + "loss": 0.0003, + "num_tokens": 20985644.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 197.90625, + "completions/mean_terminated_length": 197.90625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.10393809908765446, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.020535396528430283, + "learning_rate": 6.202e-06, + "loss": 0.0008, + "num_tokens": 21015017.0, + "reward": 3.9620487689971924, + "reward_std": 0.21468442678451538, + "rewards/reward_fn/mean": 3.9620487689971924, + "rewards/reward_fn/std": 0.2146844118833542, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 96.25, + "completions/mean_terminated_length": 96.25, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.10405358586441853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.166015625, + "kl": 0.020566764404065907, + "learning_rate": 6.2e-06, + "loss": 0.0008, + "num_tokens": 21038737.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 88.0, + "completions/max_terminated_length": 88.0, + "completions/mean_length": 55.1875, + "completions/mean_terminated_length": 55.1875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.10416907264118258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.154296875, + "kl": 0.013673302528331988, + "learning_rate": 6.198e-06, + "loss": 0.0005, + "num_tokens": 21056791.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 70.90625, + "completions/mean_terminated_length": 70.90625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.10428455941794665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.009525138295430224, + "learning_rate": 6.196e-06, + "loss": 0.0004, + "num_tokens": 21074004.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 118.375, + "completions/mean_terminated_length": 118.375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.1044000461947107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.010624492198985536, + "learning_rate": 6.1939999999999995e-06, + "loss": 0.0004, + "num_tokens": 21090048.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 229.65625, + "completions/mean_terminated_length": 229.65625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.10451553297147477, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.009237694612238556, + "learning_rate": 6.192e-06, + "loss": 0.0004, + "num_tokens": 21113973.0, + "reward": 3.935791492462158, + "reward_std": 0.25576603412628174, + "rewards/reward_fn/mean": 3.935791492462158, + "rewards/reward_fn/std": 0.2557660639286041, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 191.6875, + "completions/mean_terminated_length": 191.6875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.10463101974823882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.01579863995721098, + "learning_rate": 6.19e-06, + "loss": 0.0006, + "num_tokens": 21137323.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 57.875, + "completions/mean_terminated_length": 57.875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.10474650652500289, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.006458301384554943, + "learning_rate": 6.187999999999999e-06, + "loss": 0.0003, + "num_tokens": 21148615.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 151.625, + "completions/mean_terminated_length": 151.625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.10486199330176695, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.0625, + "kl": 0.011507754999911413, + "learning_rate": 6.186e-06, + "loss": 0.0005, + "num_tokens": 21169723.0, + "reward": 2.143500328063965, + "reward_std": 1.925046682357788, + "rewards/reward_fn/mean": 2.143500328063965, + "rewards/reward_fn/std": 1.9250465631484985, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 79.4375, + "completions/mean_terminated_length": 79.4375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.10497748007853101, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.00546998928712128, + "learning_rate": 6.1839999999999996e-06, + "loss": 0.0002, + "num_tokens": 21185129.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 321.5, + "completions/mean_terminated_length": 321.5, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.10509296685529507, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.009200321030220948, + "learning_rate": 6.182e-06, + "loss": 0.0004, + "num_tokens": 21217945.0, + "reward": 3.90041446685791, + "reward_std": 0.43343329429626465, + "rewards/reward_fn/mean": 3.90041446685791, + "rewards/reward_fn/std": 0.4334333539009094, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 304.9375, + "completions/mean_terminated_length": 304.9375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.10520845363205913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.016113138772198, + "learning_rate": 6.179999999999999e-06, + "loss": 0.0006, + "num_tokens": 21239895.0, + "reward": 3.9368367195129395, + "reward_std": 0.24855200946331024, + "rewards/reward_fn/mean": 3.9368367195129395, + "rewards/reward_fn/std": 0.24855200946331024, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 98.78125, + "completions/mean_terminated_length": 98.78125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.10532394040882319, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.90625, + "kl": 0.018340273840294685, + "learning_rate": 6.178e-06, + "loss": 0.0007, + "num_tokens": 21255888.0, + "reward": 3.4762773513793945, + "reward_std": 0.6376193761825562, + "rewards/reward_fn/mean": 3.4762773513793945, + "rewards/reward_fn/std": 0.6376194357872009, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 235.875, + "completions/mean_terminated_length": 235.875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.10543942718558726, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.01617965495097451, + "learning_rate": 6.176e-06, + "loss": 0.0006, + "num_tokens": 21284780.0, + "reward": 3.943748950958252, + "reward_std": 0.22223447263240814, + "rewards/reward_fn/mean": 3.943748950958252, + "rewards/reward_fn/std": 0.22223448753356934, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 82.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 60.96875, + "completions/mean_terminated_length": 60.96875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.10555491396235131, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.009356676746392623, + "learning_rate": 6.1740000000000005e-06, + "loss": 0.0004, + "num_tokens": 21298027.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 326.5625, + "completions/mean_terminated_length": 326.5625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.10567040073911538, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.01744942765799351, + "learning_rate": 6.1719999999999995e-06, + "loss": 0.0007, + "num_tokens": 21326717.0, + "reward": 2.9299020767211914, + "reward_std": 0.5429592132568359, + "rewards/reward_fn/mean": 2.9299020767211914, + "rewards/reward_fn/std": 0.5429592132568359, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 78.5625, + "completions/mean_terminated_length": 78.5625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.10578588751587943, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.010286152682965621, + "learning_rate": 6.169999999999999e-06, + "loss": 0.0004, + "num_tokens": 21341135.0, + "reward": 3.986166000366211, + "reward_std": 0.07825762033462524, + "rewards/reward_fn/mean": 3.986166000366211, + "rewards/reward_fn/std": 0.07825763523578644, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 319.53125, + "completions/mean_terminated_length": 319.53125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.1059013742926435, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.011087988779763691, + "learning_rate": 6.168e-06, + "loss": 0.0004, + "num_tokens": 21367104.0, + "reward": 3.856205940246582, + "reward_std": 0.5658271312713623, + "rewards/reward_fn/mean": 3.856205940246582, + "rewards/reward_fn/std": 0.5658270716667175, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 126.8125, + "completions/mean_terminated_length": 126.8125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.10601686106940755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.011873222079884727, + "learning_rate": 6.166e-06, + "loss": 0.0005, + "num_tokens": 21387034.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 158.5625, + "completions/mean_terminated_length": 158.5625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.1061323478461716, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.014804100545006804, + "learning_rate": 6.164e-06, + "loss": 0.0006, + "num_tokens": 21402732.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 72.8125, + "completions/mean_terminated_length": 72.8125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.10624783462293567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.008521059920894913, + "learning_rate": 6.162e-06, + "loss": 0.0003, + "num_tokens": 21423142.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 108.71875, + "completions/mean_terminated_length": 108.71875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.10636332139969973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.008225335644965526, + "learning_rate": 6.1599999999999995e-06, + "loss": 0.0003, + "num_tokens": 21450845.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 177.4375, + "completions/mean_terminated_length": 177.4375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.1064788081764638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.011344082304276526, + "learning_rate": 6.158e-06, + "loss": 0.0005, + "num_tokens": 21474987.0, + "reward": 3.8679211139678955, + "reward_std": 0.4581357538700104, + "rewards/reward_fn/mean": 3.8679211139678955, + "rewards/reward_fn/std": 0.4581356942653656, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 84.3125, + "completions/mean_terminated_length": 84.3125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.10659429495322785, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.01139878630056046, + "learning_rate": 6.155999999999999e-06, + "loss": 0.0005, + "num_tokens": 21488917.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 176.625, + "completions/mean_terminated_length": 176.625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.10670978172999192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.010374810037319548, + "learning_rate": 6.154e-06, + "loss": 0.0004, + "num_tokens": 21517033.0, + "reward": 3.800187349319458, + "reward_std": 0.8122778534889221, + "rewards/reward_fn/mean": 3.800187349319458, + "rewards/reward_fn/std": 0.8122777342796326, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 261.59375, + "completions/mean_terminated_length": 261.59375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.10682526850675597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046630859375, + "kl": 0.009804896879359148, + "learning_rate": 6.152e-06, + "loss": 0.0004, + "num_tokens": 21540636.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 93.84375, + "completions/mean_terminated_length": 93.84375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.10694075528352004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.014920402041752823, + "learning_rate": 6.15e-06, + "loss": 0.0006, + "num_tokens": 21562679.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 90.0625, + "completions/mean_terminated_length": 90.0625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.1070562420602841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.008518342698152992, + "learning_rate": 6.147999999999999e-06, + "loss": 0.0003, + "num_tokens": 21580537.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 268.65625, + "completions/mean_terminated_length": 268.65625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.10717172883704816, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.013522288529202342, + "learning_rate": 6.145999999999999e-06, + "loss": 0.0005, + "num_tokens": 21600206.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 66.125, + "completions/mean_terminated_length": 66.125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.10728721561381221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.013741929215029813, + "learning_rate": 6.144e-06, + "loss": 0.0005, + "num_tokens": 21618994.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 86.75, + "completions/mean_terminated_length": 86.75, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.10740270239057628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.01033076920430176, + "learning_rate": 6.142e-06, + "loss": 0.0004, + "num_tokens": 21638858.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 289.9375, + "completions/mean_terminated_length": 289.9375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.10751818916734034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.01690974361554254, + "learning_rate": 6.14e-06, + "loss": 0.0007, + "num_tokens": 21667560.0, + "reward": 3.716238498687744, + "reward_std": 0.714927613735199, + "rewards/reward_fn/mean": 3.716238498687744, + "rewards/reward_fn/std": 0.7149275541305542, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 86.375, + "completions/mean_terminated_length": 86.375, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.1076336759441044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.010056151124445023, + "learning_rate": 6.1379999999999995e-06, + "loss": 0.0004, + "num_tokens": 21687604.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 231.15625, + "completions/mean_terminated_length": 231.15625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.10774916272086846, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.018475955410394818, + "learning_rate": 6.136e-06, + "loss": 0.0007, + "num_tokens": 21713913.0, + "reward": 3.0165393352508545, + "reward_std": 0.08206525444984436, + "rewards/reward_fn/mean": 3.0165393352508545, + "rewards/reward_fn/std": 0.08206527680158615, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 85.96875, + "completions/mean_terminated_length": 85.96875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.10786464949763253, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.125, + "kl": 0.011339202370436396, + "learning_rate": 6.134e-06, + "loss": 0.0005, + "num_tokens": 21735640.0, + "reward": 3.005039691925049, + "reward_std": 0.060159046202898026, + "rewards/reward_fn/mean": 3.005039691925049, + "rewards/reward_fn/std": 0.06015905737876892, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 71.5625, + "completions/mean_terminated_length": 71.5625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.10798013627439658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.007357956266787369, + "learning_rate": 6.131999999999999e-06, + "loss": 0.0003, + "num_tokens": 21748810.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 53.21875, + "completions/mean_terminated_length": 53.21875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.10809562305116065, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.8125, + "kl": 0.016123360161145683, + "learning_rate": 6.13e-06, + "loss": 0.0006, + "num_tokens": 21771697.0, + "reward": 3.75, + "reward_std": 0.9837387204170227, + "rewards/reward_fn/mean": 3.75, + "rewards/reward_fn/std": 0.9837387204170227, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 81.53125, + "completions/mean_terminated_length": 81.53125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.1082111098279247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.014251143846195191, + "learning_rate": 6.128e-06, + "loss": 0.0006, + "num_tokens": 21796354.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 169.5, + "completions/mean_terminated_length": 169.5, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.10832659660468877, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.01955914788413793, + "learning_rate": 6.126e-06, + "loss": 0.0008, + "num_tokens": 21822642.0, + "reward": 3.673903703689575, + "reward_std": 0.4307182729244232, + "rewards/reward_fn/mean": 3.673903703689575, + "rewards/reward_fn/std": 0.4307182729244232, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 100.625, + "completions/mean_terminated_length": 100.625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.10844208338145282, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.010531369651289424, + "learning_rate": 6.123999999999999e-06, + "loss": 0.0004, + "num_tokens": 21847654.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 144.3125, + "completions/mean_terminated_length": 144.3125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.10855757015821689, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.014184991501679178, + "learning_rate": 6.122e-06, + "loss": 0.0006, + "num_tokens": 21863792.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 492.59375, + "completions/mean_terminated_length": 492.59375, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.10867305693498094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.010647367569617927, + "learning_rate": 6.12e-06, + "loss": 0.0004, + "num_tokens": 21894627.0, + "reward": 3.7818198204040527, + "reward_std": 0.6893090605735779, + "rewards/reward_fn/mean": 3.7818198204040527, + "rewards/reward_fn/std": 0.6893090009689331, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 201.46875, + "completions/mean_terminated_length": 201.46875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.10878854371174501, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.020437859566300176, + "learning_rate": 6.118e-06, + "loss": 0.0008, + "num_tokens": 21923826.0, + "reward": 3.9487385749816895, + "reward_std": 0.20200006663799286, + "rewards/reward_fn/mean": 3.9487385749816895, + "rewards/reward_fn/std": 0.20200006663799286, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 287.59375, + "completions/mean_terminated_length": 287.59375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.10890403048850907, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.014564677228918299, + "learning_rate": 6.1159999999999995e-06, + "loss": 0.0006, + "num_tokens": 21945381.0, + "reward": 3.5085866451263428, + "reward_std": 0.9436915516853333, + "rewards/reward_fn/mean": 3.5085866451263428, + "rewards/reward_fn/std": 0.9436914920806885, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 202.46875, + "completions/mean_terminated_length": 202.46875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.10901951726527313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.014020295653608628, + "learning_rate": 6.113999999999999e-06, + "loss": 0.0006, + "num_tokens": 21964212.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 173.0, + "completions/mean_terminated_length": 173.0, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.10913500404203719, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.02123874647077173, + "learning_rate": 6.112e-06, + "loss": 0.0008, + "num_tokens": 21985588.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 99.875, + "completions/mean_terminated_length": 99.875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.10925049081880124, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "kl": 0.02273062488529831, + "learning_rate": 6.11e-06, + "loss": 0.0009, + "num_tokens": 22016848.0, + "reward": 3.9407992362976074, + "reward_std": 0.2329985350370407, + "rewards/reward_fn/mean": 3.9407992362976074, + "rewards/reward_fn/std": 0.23299852013587952, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 233.34375, + "completions/mean_terminated_length": 233.34375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.10936597759556531, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.010848022779100575, + "learning_rate": 6.108e-06, + "loss": 0.0004, + "num_tokens": 22033147.0, + "reward": 3.1853089332580566, + "reward_std": 0.4791626036167145, + "rewards/reward_fn/mean": 3.1853089332580566, + "rewards/reward_fn/std": 0.47916263341903687, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 172.1875, + "completions/mean_terminated_length": 172.1875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.10948146437232936, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "kl": 0.018859497911762446, + "learning_rate": 6.106e-06, + "loss": 0.0008, + "num_tokens": 22057729.0, + "reward": 3.0882205963134766, + "reward_std": 0.06811744719743729, + "rewards/reward_fn/mean": 3.0882205963134766, + "rewards/reward_fn/std": 0.06811745464801788, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 245.375, + "completions/mean_terminated_length": 245.375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.10959695114909343, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049560546875, + "kl": 0.009814556411583908, + "learning_rate": 6.1039999999999995e-06, + "loss": 0.0004, + "num_tokens": 22080973.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1951.0, + "completions/max_terminated_length": 1951.0, + "completions/mean_length": 476.5, + "completions/mean_terminated_length": 476.5, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.10971243792585748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.012096893682610244, + "learning_rate": 6.102e-06, + "loss": 0.0005, + "num_tokens": 22109597.0, + "reward": 3.214346170425415, + "reward_std": 0.6774078011512756, + "rewards/reward_fn/mean": 3.214346170425415, + "rewards/reward_fn/std": 0.6774077415466309, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 121.84375, + "completions/mean_terminated_length": 121.84375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.10982792470262155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.013024063286138698, + "learning_rate": 6.099999999999999e-06, + "loss": 0.0005, + "num_tokens": 22127096.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 114.53125, + "completions/mean_terminated_length": 114.53125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.1099434114793856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.024339174415217713, + "learning_rate": 6.098e-06, + "loss": 0.001, + "num_tokens": 22149769.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 60.25, + "completions/mean_terminated_length": 60.25, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.11005889825614967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2578125, + "kl": 0.022972058213781565, + "learning_rate": 6.096e-06, + "loss": 0.0009, + "num_tokens": 22159921.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 445.15625, + "completions/mean_terminated_length": 445.15625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.11017438503291373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044921875, + "kl": 0.01098047821142245, + "learning_rate": 6.0940000000000004e-06, + "loss": 0.0004, + "num_tokens": 22183766.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.1102898718096778, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.016587666876148432, + "learning_rate": 6.0919999999999994e-06, + "loss": 0.0007, + "num_tokens": 22213958.0, + "reward": 3.818416118621826, + "reward_std": 0.3903583884239197, + "rewards/reward_fn/mean": 3.818416118621826, + "rewards/reward_fn/std": 0.3903583586215973, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 188.125, + "completions/mean_terminated_length": 188.125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.11040535858644185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.012614606341230683, + "learning_rate": 6.089999999999999e-06, + "loss": 0.0005, + "num_tokens": 22234218.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 214.6875, + "completions/mean_terminated_length": 214.6875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.11052084536320592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.012762330574332736, + "learning_rate": 6.088e-06, + "loss": 0.0005, + "num_tokens": 22251104.0, + "reward": 3.8332042694091797, + "reward_std": 0.35293126106262207, + "rewards/reward_fn/mean": 3.8332042694091797, + "rewards/reward_fn/std": 0.35293126106262207, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 162.6875, + "completions/mean_terminated_length": 162.6875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.11063633213996997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05029296875, + "kl": 0.009375494715641253, + "learning_rate": 6.086e-06, + "loss": 0.0004, + "num_tokens": 22270710.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 88.875, + "completions/mean_terminated_length": 88.875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.11075181891673404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.011728120773113915, + "learning_rate": 6.084e-06, + "loss": 0.0005, + "num_tokens": 22285682.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 248.5625, + "completions/mean_terminated_length": 248.5625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.11086730569349809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.341796875, + "kl": 0.017133880202891305, + "learning_rate": 6.0819999999999995e-06, + "loss": 0.0007, + "num_tokens": 22304612.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 101.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 65.28125, + "completions/mean_terminated_length": 65.28125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.11098279247026216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.013374002439377364, + "learning_rate": 6.079999999999999e-06, + "loss": 0.0005, + "num_tokens": 22325613.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 435.875, + "completions/mean_terminated_length": 435.875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.11109827924702621, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.011197018480743282, + "learning_rate": 6.078e-06, + "loss": 0.0004, + "num_tokens": 22354889.0, + "reward": 3.9242758750915527, + "reward_std": 0.42835962772369385, + "rewards/reward_fn/mean": 3.9242758750915527, + "rewards/reward_fn/std": 0.4283595681190491, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 258.21875, + "completions/mean_terminated_length": 258.21875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.11121376602379028, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.016140008869115263, + "learning_rate": 6.075999999999999e-06, + "loss": 0.0006, + "num_tokens": 22384144.0, + "reward": 3.8325250148773193, + "reward_std": 0.32507988810539246, + "rewards/reward_fn/mean": 3.8325250148773193, + "rewards/reward_fn/std": 0.32507988810539246, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 230.75, + "completions/mean_terminated_length": 230.75, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.11132925280055433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05517578125, + "kl": 0.013224169975728728, + "learning_rate": 6.074e-06, + "loss": 0.0005, + "num_tokens": 22405864.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 160.65625, + "completions/mean_terminated_length": 160.65625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.1114447395773184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.013228306794189848, + "learning_rate": 6.072e-06, + "loss": 0.0005, + "num_tokens": 22426909.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 287.1875, + "completions/mean_terminated_length": 287.1875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.11156022635408246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.01544029064825736, + "learning_rate": 6.07e-06, + "loss": 0.0006, + "num_tokens": 22451075.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 221.5, + "completions/mean_terminated_length": 221.5, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.11167571313084652, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.015191307931672782, + "learning_rate": 6.067999999999999e-06, + "loss": 0.0006, + "num_tokens": 22481811.0, + "reward": 3.878885269165039, + "reward_std": 0.2940843105316162, + "rewards/reward_fn/mean": 3.878885269165039, + "rewards/reward_fn/std": 0.2940843105316162, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 89.28125, + "completions/mean_terminated_length": 89.28125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.11179119990761058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1611328125, + "kl": 0.028134553984273225, + "learning_rate": 6.065999999999999e-06, + "loss": 0.0011, + "num_tokens": 22507356.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 248.5625, + "completions/mean_terminated_length": 248.5625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.11190668668437465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.009390073704707902, + "learning_rate": 6.064e-06, + "loss": 0.0004, + "num_tokens": 22530478.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 148.125, + "completions/mean_terminated_length": 148.125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.1120221734611387, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.3125, + "kl": 0.060688267680234276, + "learning_rate": 6.062e-06, + "loss": 0.0024, + "num_tokens": 22547730.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 120.15625, + "completions/mean_terminated_length": 120.15625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.11213766023790277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.021048821043223143, + "learning_rate": 6.06e-06, + "loss": 0.0008, + "num_tokens": 22562487.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 177.21875, + "completions/mean_terminated_length": 177.21875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.11225314701466682, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.010124969856406096, + "learning_rate": 6.0579999999999994e-06, + "loss": 0.0004, + "num_tokens": 22586718.0, + "reward": 2.8315882682800293, + "reward_std": 0.33502691984176636, + "rewards/reward_fn/mean": 2.8315882682800293, + "rewards/reward_fn/std": 0.33502691984176636, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 93.71875, + "completions/mean_terminated_length": 93.71875, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.11236863379143087, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.011470326404378284, + "learning_rate": 6.056e-06, + "loss": 0.0005, + "num_tokens": 22608213.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 285.03125, + "completions/mean_terminated_length": 285.03125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.11248412056819494, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.00973933530622162, + "learning_rate": 6.054e-06, + "loss": 0.0004, + "num_tokens": 22636918.0, + "reward": 2.789243698120117, + "reward_std": 0.02420063316822052, + "rewards/reward_fn/mean": 2.789243698120117, + "rewards/reward_fn/std": 0.024200623854994774, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 257.375, + "completions/mean_terminated_length": 257.375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.112599607344959, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.014932085308828391, + "learning_rate": 6.051999999999999e-06, + "loss": 0.0006, + "num_tokens": 22666690.0, + "reward": 3.9472081661224365, + "reward_std": 0.20799832046031952, + "rewards/reward_fn/mean": 3.9472081661224365, + "rewards/reward_fn/std": 0.20799829065799713, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 147.65625, + "completions/mean_terminated_length": 147.65625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.11271509412172306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1025390625, + "kl": 0.022451874494436197, + "learning_rate": 6.05e-06, + "loss": 0.0009, + "num_tokens": 22682391.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 73.6875, + "completions/mean_terminated_length": 73.6875, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.11283058089848712, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.011362010289303726, + "learning_rate": 6.0479999999999995e-06, + "loss": 0.0005, + "num_tokens": 22697261.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 156.4375, + "completions/mean_terminated_length": 156.4375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.11294606767525119, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.012419821345247328, + "learning_rate": 6.046e-06, + "loss": 0.0005, + "num_tokens": 22717307.0, + "reward": 3.928504705429077, + "reward_std": 0.4044385254383087, + "rewards/reward_fn/mean": 3.928504705429077, + "rewards/reward_fn/std": 0.4044385850429535, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 169.21875, + "completions/mean_terminated_length": 169.21875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.11306155445201524, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "kl": 0.016352907463442534, + "learning_rate": 6.043999999999999e-06, + "loss": 0.0007, + "num_tokens": 22745218.0, + "reward": 3.807596206665039, + "reward_std": 0.3781861364841461, + "rewards/reward_fn/mean": 3.807596206665039, + "rewards/reward_fn/std": 0.3781861364841461, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 298.46875, + "completions/mean_terminated_length": 298.46875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.11317704122877931, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.01776290062116459, + "learning_rate": 6.042e-06, + "loss": 0.0007, + "num_tokens": 22771441.0, + "reward": 3.9106040000915527, + "reward_std": 0.28329986333847046, + "rewards/reward_fn/mean": 3.9106040000915527, + "rewards/reward_fn/std": 0.28329986333847046, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 405.375, + "completions/mean_terminated_length": 405.375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.11329252800554336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.01716940148617141, + "learning_rate": 6.04e-06, + "loss": 0.0007, + "num_tokens": 22800061.0, + "reward": 3.8512027263641357, + "reward_std": 0.5855107307434082, + "rewards/reward_fn/mean": 3.8512027263641357, + "rewards/reward_fn/std": 0.5855107307434082, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 162.46875, + "completions/mean_terminated_length": 162.46875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.11340801478230743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.01077553947106935, + "learning_rate": 6.038e-06, + "loss": 0.0004, + "num_tokens": 22820268.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 139.5625, + "completions/mean_terminated_length": 139.5625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.11352350155907148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.015901109247351997, + "learning_rate": 6.0359999999999995e-06, + "loss": 0.0006, + "num_tokens": 22836862.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 148.40625, + "completions/mean_terminated_length": 148.40625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.11363898833583555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.014877611727570184, + "learning_rate": 6.033999999999999e-06, + "loss": 0.0006, + "num_tokens": 22855211.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 367.90625, + "completions/mean_terminated_length": 367.90625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.1137544751125996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.011666855367366225, + "learning_rate": 6.032e-06, + "loss": 0.0005, + "num_tokens": 22883080.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 179.75, + "completions/mean_terminated_length": 179.75, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.11386996188936367, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.02663693466456607, + "learning_rate": 6.03e-06, + "loss": 0.0011, + "num_tokens": 22909952.0, + "reward": 3.698793411254883, + "reward_std": 0.4930296242237091, + "rewards/reward_fn/mean": 3.698793411254883, + "rewards/reward_fn/std": 0.49302956461906433, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 183.90625, + "completions/mean_terminated_length": 183.90625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.11398544866612773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.013966089827590622, + "learning_rate": 6.028e-06, + "loss": 0.0006, + "num_tokens": 22927069.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 186.6875, + "completions/mean_terminated_length": 186.6875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.1141009354428918, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.014479032091912813, + "learning_rate": 6.0259999999999996e-06, + "loss": 0.0006, + "num_tokens": 22956307.0, + "reward": 2.7360899448394775, + "reward_std": 0.057223957031965256, + "rewards/reward_fn/mean": 2.7360899448394775, + "rewards/reward_fn/std": 0.057223957031965256, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 140.4375, + "completions/mean_terminated_length": 140.4375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.11421642221965585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.011442451825132594, + "learning_rate": 6.023999999999999e-06, + "loss": 0.0005, + "num_tokens": 22972257.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 142.5625, + "completions/mean_terminated_length": 142.5625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.11433190899641991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.015804221475264058, + "learning_rate": 6.022e-06, + "loss": 0.0006, + "num_tokens": 22994995.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 283.53125, + "completions/mean_terminated_length": 283.53125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.11444739577318397, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.01482641878828872, + "learning_rate": 6.019999999999999e-06, + "loss": 0.0006, + "num_tokens": 23025924.0, + "reward": 3.5292699337005615, + "reward_std": 0.43781358003616333, + "rewards/reward_fn/mean": 3.5292699337005615, + "rewards/reward_fn/std": 0.43781355023384094, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 160.03125, + "completions/mean_terminated_length": 160.03125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.11456288254994804, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.026341383927501738, + "learning_rate": 6.018e-06, + "loss": 0.0011, + "num_tokens": 23056773.0, + "reward": 3.951209545135498, + "reward_std": 0.19216616451740265, + "rewards/reward_fn/mean": 3.951209545135498, + "rewards/reward_fn/std": 0.19216614961624146, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 187.75, + "completions/mean_terminated_length": 187.75, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.11467836932671209, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.016345346986781806, + "learning_rate": 6.016e-06, + "loss": 0.0007, + "num_tokens": 23085117.0, + "reward": 3.928744077682495, + "reward_std": 0.4030844271183014, + "rewards/reward_fn/mean": 3.928744077682495, + "rewards/reward_fn/std": 0.403084397315979, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 392.03125, + "completions/mean_terminated_length": 392.03125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.11479385610347616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04638671875, + "kl": 0.012677378486841917, + "learning_rate": 6.014e-06, + "loss": 0.0005, + "num_tokens": 23112830.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 247.65625, + "completions/mean_terminated_length": 247.65625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.11490934288024021, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.011970179111813195, + "learning_rate": 6.011999999999999e-06, + "loss": 0.0005, + "num_tokens": 23146035.0, + "reward": 2.893343925476074, + "reward_std": 0.07141231745481491, + "rewards/reward_fn/mean": 2.893343925476074, + "rewards/reward_fn/std": 0.07141232490539551, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 326.25, + "completions/mean_terminated_length": 326.25, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.11502482965700428, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.01735212885250803, + "learning_rate": 6.009999999999999e-06, + "loss": 0.0007, + "num_tokens": 23178459.0, + "reward": 2.891467571258545, + "reward_std": 0.4834327697753906, + "rewards/reward_fn/mean": 2.891467571258545, + "rewards/reward_fn/std": 0.48343273997306824, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 313.9375, + "completions/mean_terminated_length": 313.9375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.11514031643376833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049072265625, + "kl": 0.01166098659450654, + "learning_rate": 6.008e-06, + "loss": 0.0005, + "num_tokens": 23200057.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 190.21875, + "completions/mean_terminated_length": 190.21875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.1152558032105324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05908203125, + "kl": 0.011232808508793823, + "learning_rate": 6.006e-06, + "loss": 0.0004, + "num_tokens": 23218144.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 367.6875, + "completions/mean_terminated_length": 367.6875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.11537128998729645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.016581741074332967, + "learning_rate": 6.004e-06, + "loss": 0.0007, + "num_tokens": 23245142.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 160.90625, + "completions/mean_terminated_length": 160.90625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.11548677676406051, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.015029612637590617, + "learning_rate": 6.0019999999999995e-06, + "loss": 0.0006, + "num_tokens": 23262227.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 184.84375, + "completions/mean_terminated_length": 184.84375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.11560226354082458, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05908203125, + "kl": 0.011602810249314643, + "learning_rate": 6e-06, + "loss": 0.0005, + "num_tokens": 23282830.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 69.65625, + "completions/mean_terminated_length": 69.65625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.11571775031758863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.010858666944841389, + "learning_rate": 5.998e-06, + "loss": 0.0004, + "num_tokens": 23303235.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 86.625, + "completions/mean_terminated_length": 86.625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.1158332370943527, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.010143851264729165, + "learning_rate": 5.996e-06, + "loss": 0.0004, + "num_tokens": 23327319.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 98.40625, + "completions/mean_terminated_length": 98.40625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.11594872387111675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.014794874470680952, + "learning_rate": 5.994e-06, + "loss": 0.0006, + "num_tokens": 23344516.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 105.28125, + "completions/mean_terminated_length": 105.28125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.11606421064788082, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.010532163450989174, + "learning_rate": 5.9919999999999996e-06, + "loss": 0.0004, + "num_tokens": 23370701.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 83.0, + "completions/mean_terminated_length": 83.0, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.11617969742464487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.012766816256771563, + "learning_rate": 5.99e-06, + "loss": 0.0005, + "num_tokens": 23385389.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 82.96875, + "completions/mean_terminated_length": 82.96875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.11629518420140894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.012524250174465124, + "learning_rate": 5.988e-06, + "loss": 0.0005, + "num_tokens": 23400396.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 263.53125, + "completions/mean_terminated_length": 263.53125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.116410670978173, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.022112914215540513, + "learning_rate": 5.985999999999999e-06, + "loss": 0.0009, + "num_tokens": 23418365.0, + "reward": 2.9769961833953857, + "reward_std": 0.22517287731170654, + "rewards/reward_fn/mean": 2.9769961833953857, + "rewards/reward_fn/std": 0.22517284750938416, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 206.78125, + "completions/mean_terminated_length": 206.78125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.11652615775493706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.010283544077537954, + "learning_rate": 5.984e-06, + "loss": 0.0004, + "num_tokens": 23440662.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 110.9375, + "completions/mean_terminated_length": 110.9375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.11664164453170112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.0054206209497351665, + "learning_rate": 5.982e-06, + "loss": 0.0002, + "num_tokens": 23466036.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 85.9375, + "completions/mean_terminated_length": 85.9375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.11675713130846518, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.875, + "kl": 0.00980017137771938, + "learning_rate": 5.98e-06, + "loss": 0.0004, + "num_tokens": 23491154.0, + "reward": 2.82126522064209, + "reward_std": 0.053011950105428696, + "rewards/reward_fn/mean": 2.82126522064209, + "rewards/reward_fn/std": 0.05301191285252571, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 80.09375, + "completions/mean_terminated_length": 80.09375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.11687261808522924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1337890625, + "kl": 0.013274593628011644, + "learning_rate": 5.977999999999999e-06, + "loss": 0.0005, + "num_tokens": 23511061.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 119.8125, + "completions/mean_terminated_length": 119.8125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.1169881048619933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.01463321274786722, + "learning_rate": 5.976e-06, + "loss": 0.0006, + "num_tokens": 23526415.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 191.78125, + "completions/mean_terminated_length": 191.78125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.11710359163875736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.015214479994028807, + "learning_rate": 5.974e-06, + "loss": 0.0006, + "num_tokens": 23544712.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 208.09375, + "completions/mean_terminated_length": 208.09375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.11721907841552143, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.012559748036437668, + "learning_rate": 5.972e-06, + "loss": 0.0005, + "num_tokens": 23575563.0, + "reward": 3.707735061645508, + "reward_std": 0.6886378526687622, + "rewards/reward_fn/mean": 3.707735061645508, + "rewards/reward_fn/std": 0.6886378526687622, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 130.4375, + "completions/mean_terminated_length": 130.4375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.11733456519228548, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.01580134309187997, + "learning_rate": 5.97e-06, + "loss": 0.0006, + "num_tokens": 23596089.0, + "reward": 3.964085102081299, + "reward_std": 0.20316460728645325, + "rewards/reward_fn/mean": 3.964085102081299, + "rewards/reward_fn/std": 0.20316457748413086, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 211.78125, + "completions/mean_terminated_length": 211.78125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.11745005196904955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041748046875, + "kl": 0.00874817274598172, + "learning_rate": 5.9679999999999994e-06, + "loss": 0.0003, + "num_tokens": 23624434.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 171.4375, + "completions/mean_terminated_length": 171.4375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.1175655387458136, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.019130523141939193, + "learning_rate": 5.966e-06, + "loss": 0.0008, + "num_tokens": 23651136.0, + "reward": 3.859405517578125, + "reward_std": 0.5532437562942505, + "rewards/reward_fn/mean": 3.859405517578125, + "rewards/reward_fn/std": 0.5532437562942505, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 138.625, + "completions/mean_terminated_length": 138.625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.11768102552257767, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.023506541969254613, + "learning_rate": 5.964e-06, + "loss": 0.0009, + "num_tokens": 23678164.0, + "reward": 3.3937883377075195, + "reward_std": 0.48691076040267944, + "rewards/reward_fn/mean": 3.3937883377075195, + "rewards/reward_fn/std": 0.48691073060035706, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 114.1875, + "completions/mean_terminated_length": 114.1875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.11779651229934172, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.012019916546705645, + "learning_rate": 5.962e-06, + "loss": 0.0005, + "num_tokens": 23706138.0, + "reward": 3.9059207439422607, + "reward_std": 0.2972452938556671, + "rewards/reward_fn/mean": 3.9059207439422607, + "rewards/reward_fn/std": 0.2972452938556671, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 118.625, + "completions/mean_terminated_length": 118.625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.11791199907610579, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.015011178424174432, + "learning_rate": 5.96e-06, + "loss": 0.0006, + "num_tokens": 23729678.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 57.65625, + "completions/mean_terminated_length": 57.65625, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.11802748585286985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2451171875, + "kl": 0.017891102739667986, + "learning_rate": 5.9579999999999995e-06, + "loss": 0.0007, + "num_tokens": 23740835.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 147.21875, + "completions/mean_terminated_length": 147.21875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.11814297262963391, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.015884780921624042, + "learning_rate": 5.956e-06, + "loss": 0.0006, + "num_tokens": 23771242.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 382.03125, + "completions/mean_terminated_length": 382.03125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.11825845940639797, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.010889718876569532, + "learning_rate": 5.953999999999999e-06, + "loss": 0.0004, + "num_tokens": 23798923.0, + "reward": 3.928621768951416, + "reward_std": 0.4037759304046631, + "rewards/reward_fn/mean": 3.928621768951416, + "rewards/reward_fn/std": 0.4037759304046631, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 115.4375, + "completions/mean_terminated_length": 115.4375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.11837394618316203, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.018179040867835283, + "learning_rate": 5.952e-06, + "loss": 0.0007, + "num_tokens": 23819321.0, + "reward": 3.8997929096221924, + "reward_std": 0.42903637886047363, + "rewards/reward_fn/mean": 3.8997929096221924, + "rewards/reward_fn/std": 0.4290364384651184, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 178.21875, + "completions/mean_terminated_length": 178.21875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.11848943295992609, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.011877153694513254, + "learning_rate": 5.95e-06, + "loss": 0.0005, + "num_tokens": 23837024.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 66.09375, + "completions/mean_terminated_length": 66.09375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.11860491973669014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.009853940195171162, + "learning_rate": 5.9480000000000005e-06, + "loss": 0.0004, + "num_tokens": 23858243.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 201.1875, + "completions/mean_terminated_length": 201.1875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.11872040651345421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052001953125, + "kl": 0.009775817161425948, + "learning_rate": 5.9459999999999995e-06, + "loss": 0.0004, + "num_tokens": 23890953.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 96.59375, + "completions/mean_terminated_length": 96.59375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.11883589329021826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.011001961131114513, + "learning_rate": 5.943999999999999e-06, + "loss": 0.0004, + "num_tokens": 23916220.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 145.25, + "completions/mean_terminated_length": 145.25, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.11895138006698233, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.012293027990381233, + "learning_rate": 5.942e-06, + "loss": 0.0005, + "num_tokens": 23941700.0, + "reward": 3.913696050643921, + "reward_std": 0.23209060728549957, + "rewards/reward_fn/mean": 3.913696050643921, + "rewards/reward_fn/std": 0.23209060728549957, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 366.71875, + "completions/mean_terminated_length": 366.71875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.11906686684374639, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.01344928372418508, + "learning_rate": 5.94e-06, + "loss": 0.0005, + "num_tokens": 23975323.0, + "reward": 3.92445707321167, + "reward_std": 0.42733582854270935, + "rewards/reward_fn/mean": 3.92445707321167, + "rewards/reward_fn/std": 0.42733582854270935, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 264.625, + "completions/mean_terminated_length": 264.625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.11918235362051045, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.02011718372523319, + "learning_rate": 5.938e-06, + "loss": 0.0008, + "num_tokens": 24006159.0, + "reward": 2.9134621620178223, + "reward_std": 0.05047595128417015, + "rewards/reward_fn/mean": 2.9134621620178223, + "rewards/reward_fn/std": 0.05047593265771866, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 132.3125, + "completions/mean_terminated_length": 132.3125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.11929784039727451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.01091788019402884, + "learning_rate": 5.936e-06, + "loss": 0.0004, + "num_tokens": 24033113.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 85.78125, + "completions/mean_terminated_length": 85.78125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.11941332717403857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2294921875, + "kl": 0.02318451725295745, + "learning_rate": 5.934e-06, + "loss": 0.0009, + "num_tokens": 24058098.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 130.1875, + "completions/mean_terminated_length": 130.1875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.11952881395080263, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.016107278817798942, + "learning_rate": 5.932e-06, + "loss": 0.0006, + "num_tokens": 24075000.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 327.9375, + "completions/mean_terminated_length": 327.9375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.1196443007275667, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.013515659753466025, + "learning_rate": 5.929999999999999e-06, + "loss": 0.0005, + "num_tokens": 24103958.0, + "reward": 3.905579090118408, + "reward_std": 0.42017892003059387, + "rewards/reward_fn/mean": 3.905579090118408, + "rewards/reward_fn/std": 0.4201788604259491, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 149.1875, + "completions/mean_terminated_length": 149.1875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.11975978750433075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.01670035655843094, + "learning_rate": 5.928e-06, + "loss": 0.0007, + "num_tokens": 24125852.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 102.71875, + "completions/mean_terminated_length": 102.71875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.11987527428109482, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.078125, + "kl": 0.010962027437926736, + "learning_rate": 5.926e-06, + "loss": 0.0004, + "num_tokens": 24141299.0, + "reward": 3.7052454948425293, + "reward_std": 0.4790739119052887, + "rewards/reward_fn/mean": 3.7052454948425293, + "rewards/reward_fn/std": 0.4790739119052887, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 206.0, + "completions/mean_terminated_length": 206.0, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.11999076105785887, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.012691079551586881, + "learning_rate": 5.924e-06, + "loss": 0.0005, + "num_tokens": 24171507.0, + "reward": 3.951962471008301, + "reward_std": 0.18903914093971252, + "rewards/reward_fn/mean": 3.951962471008301, + "rewards/reward_fn/std": 0.18903912603855133, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 113.25, + "completions/mean_terminated_length": 113.25, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.12010624783462294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203125, + "kl": 0.013259135303087533, + "learning_rate": 5.921999999999999e-06, + "loss": 0.0005, + "num_tokens": 24198395.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 136.28125, + "completions/mean_terminated_length": 136.28125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.120221734611387, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.015269479263224639, + "learning_rate": 5.92e-06, + "loss": 0.0006, + "num_tokens": 24214724.0, + "reward": 3.7938828468322754, + "reward_std": 0.6511263847351074, + "rewards/reward_fn/mean": 3.7938828468322754, + "rewards/reward_fn/std": 0.6511264443397522, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 80.40625, + "completions/mean_terminated_length": 80.40625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.12033722138815106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.006612933982978575, + "learning_rate": 5.918e-06, + "loss": 0.0003, + "num_tokens": 24237297.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 75.3125, + "completions/mean_terminated_length": 75.3125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.12045270816491511, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.006077978063331102, + "learning_rate": 5.916e-06, + "loss": 0.0002, + "num_tokens": 24254875.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 196.65625, + "completions/mean_terminated_length": 196.65625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.12056819494167918, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.011213234742172062, + "learning_rate": 5.914e-06, + "loss": 0.0004, + "num_tokens": 24285136.0, + "reward": 3.887362003326416, + "reward_std": 0.3091580867767334, + "rewards/reward_fn/mean": 3.887362003326416, + "rewards/reward_fn/std": 0.3091580867767334, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 347.28125, + "completions/mean_terminated_length": 347.28125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.12068368171844324, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.012162507453467697, + "learning_rate": 5.9119999999999995e-06, + "loss": 0.0005, + "num_tokens": 24318713.0, + "reward": 3.449303150177002, + "reward_std": 0.7649373412132263, + "rewards/reward_fn/mean": 3.449303150177002, + "rewards/reward_fn/std": 0.7649373412132263, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 66.40625, + "completions/mean_terminated_length": 66.40625, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.1207991684952073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.010137548830243759, + "learning_rate": 5.91e-06, + "loss": 0.0004, + "num_tokens": 24336582.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 131.9375, + "completions/mean_terminated_length": 131.9375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.12091465527197136, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.013054655079031363, + "learning_rate": 5.908e-06, + "loss": 0.0005, + "num_tokens": 24350532.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 127.28125, + "completions/mean_terminated_length": 127.28125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.12103014204873543, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.009683880758529995, + "learning_rate": 5.905999999999999e-06, + "loss": 0.0004, + "num_tokens": 24371821.0, + "reward": 3.975839853286743, + "reward_std": 0.13667024672031403, + "rewards/reward_fn/mean": 3.975839853286743, + "rewards/reward_fn/std": 0.13667021691799164, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 260.71875, + "completions/mean_terminated_length": 260.71875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.12114562882549948, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.01638141175499186, + "learning_rate": 5.904e-06, + "loss": 0.0007, + "num_tokens": 24396804.0, + "reward": 2.8638358116149902, + "reward_std": 0.05772831290960312, + "rewards/reward_fn/mean": 2.8638358116149902, + "rewards/reward_fn/std": 0.057728275656700134, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 217.96875, + "completions/mean_terminated_length": 217.96875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.12126111560226355, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.014403393273823895, + "learning_rate": 5.9019999999999996e-06, + "loss": 0.0006, + "num_tokens": 24412739.0, + "reward": 2.9835309982299805, + "reward_std": 0.07863600552082062, + "rewards/reward_fn/mean": 2.9835309982299805, + "rewards/reward_fn/std": 0.07863599807024002, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 310.90625, + "completions/mean_terminated_length": 310.90625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.1213766023790276, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.01173450640635565, + "learning_rate": 5.9e-06, + "loss": 0.0005, + "num_tokens": 24432320.0, + "reward": 3.931988477706909, + "reward_std": 0.3847314119338989, + "rewards/reward_fn/mean": 3.931988477706909, + "rewards/reward_fn/std": 0.38473138213157654, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 76.34375, + "completions/mean_terminated_length": 76.34375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.12149208915579167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.012022874761896674, + "learning_rate": 5.897999999999999e-06, + "loss": 0.0005, + "num_tokens": 24445419.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 168.65625, + "completions/mean_terminated_length": 168.65625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.12160757593255572, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.011488398085930385, + "learning_rate": 5.896e-06, + "loss": 0.0005, + "num_tokens": 24459776.0, + "reward": 3.9281649589538574, + "reward_std": 0.4063608646392822, + "rewards/reward_fn/mean": 3.9281649589538574, + "rewards/reward_fn/std": 0.4063608944416046, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 223.5625, + "completions/mean_terminated_length": 223.5625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.12172306270931978, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.012783557816874236, + "learning_rate": 5.894e-06, + "loss": 0.0005, + "num_tokens": 24479602.0, + "reward": 3.856703042984009, + "reward_std": 0.5638673901557922, + "rewards/reward_fn/mean": 3.856703042984009, + "rewards/reward_fn/std": 0.5638673901557922, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 217.03125, + "completions/mean_terminated_length": 217.03125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.12183854948608384, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.010736599579104222, + "learning_rate": 5.892e-06, + "loss": 0.0004, + "num_tokens": 24510611.0, + "reward": 3.785651206970215, + "reward_std": 0.620324432849884, + "rewards/reward_fn/mean": 3.785651206970215, + "rewards/reward_fn/std": 0.6203243732452393, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 216.875, + "completions/mean_terminated_length": 216.875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.1219540362628479, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.012843923934269696, + "learning_rate": 5.8899999999999995e-06, + "loss": 0.0005, + "num_tokens": 24536847.0, + "reward": 3.8951003551483154, + "reward_std": 0.44940492510795593, + "rewards/reward_fn/mean": 3.8951003551483154, + "rewards/reward_fn/std": 0.44940486550331116, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 104.4375, + "completions/mean_terminated_length": 104.4375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.12206952303961197, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4375, + "kl": 0.010492712157429196, + "learning_rate": 5.887999999999999e-06, + "loss": 0.0004, + "num_tokens": 24563005.0, + "reward": 3.9308218955993652, + "reward_std": 0.39133110642433167, + "rewards/reward_fn/mean": 3.9308218955993652, + "rewards/reward_fn/std": 0.3913310766220093, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 140.09375, + "completions/mean_terminated_length": 140.09375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.12218500981637602, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.01090296870097518, + "learning_rate": 5.886e-06, + "loss": 0.0004, + "num_tokens": 24589984.0, + "reward": 3.6720895767211914, + "reward_std": 0.3340012729167938, + "rewards/reward_fn/mean": 3.6720895767211914, + "rewards/reward_fn/std": 0.3340012729167938, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 176.5, + "completions/mean_terminated_length": 176.5, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.12230049659314009, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.010678680417186115, + "learning_rate": 5.884e-06, + "loss": 0.0004, + "num_tokens": 24611696.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 159.40625, + "completions/mean_terminated_length": 159.40625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.12241598336990414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.01296248821017798, + "learning_rate": 5.882e-06, + "loss": 0.0005, + "num_tokens": 24632893.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 311.53125, + "completions/mean_terminated_length": 311.53125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.12253147014666821, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.012928602052852511, + "learning_rate": 5.88e-06, + "loss": 0.0005, + "num_tokens": 24652398.0, + "reward": 3.9328291416168213, + "reward_std": 0.3799760937690735, + "rewards/reward_fn/mean": 3.9328291416168213, + "rewards/reward_fn/std": 0.3799760341644287, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 195.3125, + "completions/mean_terminated_length": 195.3125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.12264695692343226, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.006462225988798309, + "learning_rate": 5.8779999999999995e-06, + "loss": 0.0003, + "num_tokens": 24683224.0, + "reward": 3.973416805267334, + "reward_std": 0.15037687122821808, + "rewards/reward_fn/mean": 3.973416805267334, + "rewards/reward_fn/std": 0.1503768265247345, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 122.6875, + "completions/mean_terminated_length": 122.6875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.12276244370019633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11572265625, + "kl": 0.014061555804801174, + "learning_rate": 5.876e-06, + "loss": 0.0006, + "num_tokens": 24699662.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 87.21875, + "completions/mean_terminated_length": 87.21875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.12287793047696038, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.006096124832765781, + "learning_rate": 5.873999999999999e-06, + "loss": 0.0002, + "num_tokens": 24713365.0, + "reward": 3.9883368015289307, + "reward_std": 0.06597734242677689, + "rewards/reward_fn/mean": 3.9883368015289307, + "rewards/reward_fn/std": 0.06597734987735748, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1241.0, + "completions/max_terminated_length": 1241.0, + "completions/mean_length": 499.875, + "completions/mean_terminated_length": 499.875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.12299341725372445, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.012031370715703815, + "learning_rate": 5.872e-06, + "loss": 0.0005, + "num_tokens": 24737873.0, + "reward": 3.3607025146484375, + "reward_std": 0.8664507269859314, + "rewards/reward_fn/mean": 3.3607025146484375, + "rewards/reward_fn/std": 0.8664506673812866, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 86.625, + "completions/mean_terminated_length": 86.625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.1231089040304885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.011344177084538387, + "learning_rate": 5.87e-06, + "loss": 0.0005, + "num_tokens": 24760549.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 387.875, + "completions/mean_terminated_length": 387.875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.12322439080725257, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.01420929777668789, + "learning_rate": 5.868e-06, + "loss": 0.0006, + "num_tokens": 24785057.0, + "reward": 3.781047821044922, + "reward_std": 0.6916873455047607, + "rewards/reward_fn/mean": 3.781047821044922, + "rewards/reward_fn/std": 0.691687285900116, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 246.59375, + "completions/mean_terminated_length": 246.59375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.12333987758401663, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.012131209041399416, + "learning_rate": 5.865999999999999e-06, + "loss": 0.0005, + "num_tokens": 24823860.0, + "reward": 3.855374336242676, + "reward_std": 0.5691724419593811, + "rewards/reward_fn/mean": 3.855374336242676, + "rewards/reward_fn/std": 0.5691725015640259, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 349.375, + "completions/mean_terminated_length": 349.375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.1234553643607807, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.012039704466587864, + "learning_rate": 5.863999999999999e-06, + "loss": 0.0005, + "num_tokens": 24844640.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 229.59375, + "completions/mean_terminated_length": 229.59375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.12357085113754475, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.019341943501785863, + "learning_rate": 5.862e-06, + "loss": 0.0008, + "num_tokens": 24876403.0, + "reward": 3.333045482635498, + "reward_std": 0.4240139424800873, + "rewards/reward_fn/mean": 3.333045482635498, + "rewards/reward_fn/std": 0.4240139424800873, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 252.96875, + "completions/mean_terminated_length": 252.96875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.12368633791430882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.009295764983107802, + "learning_rate": 5.86e-06, + "loss": 0.0004, + "num_tokens": 24908658.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 190.125, + "completions/mean_terminated_length": 190.125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.12380182469107287, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.012234579553478397, + "learning_rate": 5.858e-06, + "loss": 0.0005, + "num_tokens": 24927414.0, + "reward": 3.404625415802002, + "reward_std": 0.6053003668785095, + "rewards/reward_fn/mean": 3.404625415802002, + "rewards/reward_fn/std": 0.6053003668785095, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 57.59375, + "completions/mean_terminated_length": 57.59375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.12391731146783694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.014087545503571164, + "learning_rate": 5.8559999999999995e-06, + "loss": 0.0006, + "num_tokens": 24946313.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 169.375, + "completions/mean_terminated_length": 169.375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.12403279824460099, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.015263688124832697, + "learning_rate": 5.854e-06, + "loss": 0.0006, + "num_tokens": 24965557.0, + "reward": 2.883086681365967, + "reward_std": 0.024274472147226334, + "rewards/reward_fn/mean": 2.883086681365967, + "rewards/reward_fn/std": 0.02427447773516178, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 262.25, + "completions/mean_terminated_length": 262.25, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.12414828502136506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.012660212218179367, + "learning_rate": 5.852e-06, + "loss": 0.0005, + "num_tokens": 24995485.0, + "reward": 3.8630154132843018, + "reward_std": 0.4766187369823456, + "rewards/reward_fn/mean": 3.8630154132843018, + "rewards/reward_fn/std": 0.4766187369823456, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 138.09375, + "completions/mean_terminated_length": 138.09375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.12426377179812911, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.014681433938676491, + "learning_rate": 5.849999999999999e-06, + "loss": 0.0006, + "num_tokens": 25024032.0, + "reward": 3.9306418895721436, + "reward_std": 0.3923487365245819, + "rewards/reward_fn/mean": 3.9306418895721436, + "rewards/reward_fn/std": 0.3923487663269043, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 127.25, + "completions/mean_terminated_length": 127.25, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.12437925857489318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.012382982822600752, + "learning_rate": 5.848e-06, + "loss": 0.0005, + "num_tokens": 25050024.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 167.125, + "completions/mean_terminated_length": 167.125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.12449474535165723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.013798808329738677, + "learning_rate": 5.846e-06, + "loss": 0.0006, + "num_tokens": 25067148.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 160.625, + "completions/mean_terminated_length": 160.625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.1246102321284213, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.013122743883286603, + "learning_rate": 5.844e-06, + "loss": 0.0005, + "num_tokens": 25080256.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 303.59375, + "completions/mean_terminated_length": 303.59375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.12472571890518536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.017571254720678553, + "learning_rate": 5.841999999999999e-06, + "loss": 0.0007, + "num_tokens": 25104915.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 306.1875, + "completions/mean_terminated_length": 306.1875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.12484120568194941, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.012665271176956594, + "learning_rate": 5.84e-06, + "loss": 0.0005, + "num_tokens": 25134809.0, + "reward": 3.086977005004883, + "reward_std": 0.7865707278251648, + "rewards/reward_fn/mean": 3.086977005004883, + "rewards/reward_fn/std": 0.78657066822052, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 163.34375, + "completions/mean_terminated_length": 163.34375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.12495669245871348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.010859614914807025, + "learning_rate": 5.838e-06, + "loss": 0.0004, + "num_tokens": 25159300.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 115.28125, + "completions/mean_terminated_length": 115.28125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.12507217923547753, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.014774845869396813, + "learning_rate": 5.836e-06, + "loss": 0.0006, + "num_tokens": 25174125.0, + "reward": 3.7605879306793213, + "reward_std": 0.3921864628791809, + "rewards/reward_fn/mean": 3.7605879306793213, + "rewards/reward_fn/std": 0.3921864926815033, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 246.9375, + "completions/mean_terminated_length": 246.9375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.1251876660122416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05712890625, + "kl": 0.012053570113494061, + "learning_rate": 5.8339999999999995e-06, + "loss": 0.0005, + "num_tokens": 25191595.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 209.1875, + "completions/mean_terminated_length": 209.1875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.12530315278900567, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.01388838596176356, + "learning_rate": 5.831999999999999e-06, + "loss": 0.0006, + "num_tokens": 25212561.0, + "reward": 3.3785934448242188, + "reward_std": 0.6673559546470642, + "rewards/reward_fn/mean": 3.3785934448242188, + "rewards/reward_fn/std": 0.6673559546470642, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 251.09375, + "completions/mean_terminated_length": 251.09375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.1254186395657697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.012741786587866955, + "learning_rate": 5.83e-06, + "loss": 0.0005, + "num_tokens": 25240596.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 300.25, + "completions/mean_terminated_length": 300.25, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.12553412634253378, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.016160047249286436, + "learning_rate": 5.828e-06, + "loss": 0.0006, + "num_tokens": 25269628.0, + "reward": 3.2229721546173096, + "reward_std": 0.968970537185669, + "rewards/reward_fn/mean": 3.2229721546173096, + "rewards/reward_fn/std": 0.9689705967903137, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 156.0625, + "completions/mean_terminated_length": 156.0625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.12564961311929784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.014668018775410019, + "learning_rate": 5.825999999999999e-06, + "loss": 0.0006, + "num_tokens": 25287806.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 88.09375, + "completions/mean_terminated_length": 88.09375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.1257650998960619, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.013210741220973432, + "learning_rate": 5.824e-06, + "loss": 0.0005, + "num_tokens": 25311553.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 147.8125, + "completions/mean_terminated_length": 147.8125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.12588058667282595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.015214875500532798, + "learning_rate": 5.8219999999999995e-06, + "loss": 0.0006, + "num_tokens": 25326651.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 128.90625, + "completions/mean_terminated_length": 128.90625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.12599607344959002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.014691114411107264, + "learning_rate": 5.82e-06, + "loss": 0.0006, + "num_tokens": 25343320.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 148.53125, + "completions/mean_terminated_length": 148.53125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.12611156022635409, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.953125, + "kl": 0.014823583114775829, + "learning_rate": 5.817999999999999e-06, + "loss": 0.0006, + "num_tokens": 25358665.0, + "reward": 3.794793128967285, + "reward_std": 0.6483346819877625, + "rewards/reward_fn/mean": 3.794793128967285, + "rewards/reward_fn/std": 0.6483346223831177, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 181.59375, + "completions/mean_terminated_length": 181.59375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.12622704700311815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.020164625908364542, + "learning_rate": 5.816e-06, + "loss": 0.0008, + "num_tokens": 25389404.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 53.53125, + "completions/mean_terminated_length": 53.53125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.1263425337798822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.01045877464912337, + "learning_rate": 5.814e-06, + "loss": 0.0004, + "num_tokens": 25405069.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 166.65625, + "completions/mean_terminated_length": 166.65625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.12645802055664626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.302734375, + "kl": 0.023689146531978622, + "learning_rate": 5.8120000000000004e-06, + "loss": 0.0009, + "num_tokens": 25423106.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 205.0625, + "completions/mean_terminated_length": 205.0625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.12657350733341033, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.022174551471835002, + "learning_rate": 5.8099999999999994e-06, + "loss": 0.0009, + "num_tokens": 25452068.0, + "reward": 2.747878074645996, + "reward_std": 0.20286725461483002, + "rewards/reward_fn/mean": 2.747878074645996, + "rewards/reward_fn/std": 0.20286725461483002, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 70.46875, + "completions/mean_terminated_length": 70.46875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.1266889941101744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.007192667559138499, + "learning_rate": 5.807999999999999e-06, + "loss": 0.0003, + "num_tokens": 25473363.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 53.90625, + "completions/mean_terminated_length": 53.90625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.12680448088693844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.138671875, + "kl": 0.010710591144743375, + "learning_rate": 5.806e-06, + "loss": 0.0004, + "num_tokens": 25499792.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 212.53125, + "completions/mean_terminated_length": 212.53125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.1269199676637025, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.014996703845099546, + "learning_rate": 5.804e-06, + "loss": 0.0006, + "num_tokens": 25521889.0, + "reward": 3.824967384338379, + "reward_std": 0.4703097641468048, + "rewards/reward_fn/mean": 3.824967384338379, + "rewards/reward_fn/std": 0.4703097343444824, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 225.59375, + "completions/mean_terminated_length": 225.59375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.12703545444046657, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.011773883292335086, + "learning_rate": 5.802e-06, + "loss": 0.0005, + "num_tokens": 25548532.0, + "reward": 3.900148868560791, + "reward_std": 0.4362471103668213, + "rewards/reward_fn/mean": 3.900148868560791, + "rewards/reward_fn/std": 0.4362470805644989, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 76.09375, + "completions/mean_terminated_length": 76.09375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.12715094121723064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12255859375, + "kl": 0.011361697612301214, + "learning_rate": 5.7999999999999995e-06, + "loss": 0.0005, + "num_tokens": 25573783.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 212.71875, + "completions/mean_terminated_length": 212.71875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.12726642799399468, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.014555666537489742, + "learning_rate": 5.797999999999999e-06, + "loss": 0.0006, + "num_tokens": 25596590.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 207.40625, + "completions/mean_terminated_length": 207.40625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.12738191477075875, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.012552703425171785, + "learning_rate": 5.796e-06, + "loss": 0.0005, + "num_tokens": 25616187.0, + "reward": 3.830740213394165, + "reward_std": 0.56941819190979, + "rewards/reward_fn/mean": 3.830740213394165, + "rewards/reward_fn/std": 0.56941819190979, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 76.84375, + "completions/mean_terminated_length": 76.84375, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.12749740154752282, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.01043184274021769, + "learning_rate": 5.793999999999999e-06, + "loss": 0.0004, + "num_tokens": 25635798.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 144.625, + "completions/mean_terminated_length": 144.625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.12761288832428688, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.022014573056367226, + "learning_rate": 5.792e-06, + "loss": 0.0009, + "num_tokens": 25660970.0, + "reward": 3.857457160949707, + "reward_std": 0.34467586874961853, + "rewards/reward_fn/mean": 3.857457160949707, + "rewards/reward_fn/std": 0.3446758985519409, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 131.21875, + "completions/mean_terminated_length": 131.21875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.12772837510105092, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.015171921564615332, + "learning_rate": 5.79e-06, + "loss": 0.0006, + "num_tokens": 25682801.0, + "reward": 3.970525026321411, + "reward_std": 0.16673576831817627, + "rewards/reward_fn/mean": 3.970525026321411, + "rewards/reward_fn/std": 0.16673575341701508, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 103.5, + "completions/mean_terminated_length": 103.5, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.127843861877815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.013692118198378012, + "learning_rate": 5.788e-06, + "loss": 0.0005, + "num_tokens": 25707361.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 115.03125, + "completions/mean_terminated_length": 115.03125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.12795934865457906, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.014710646013554651, + "learning_rate": 5.785999999999999e-06, + "loss": 0.0006, + "num_tokens": 25720418.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 223.15625, + "completions/mean_terminated_length": 223.15625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.1280748354313431, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.023592544806888327, + "learning_rate": 5.783999999999999e-06, + "loss": 0.0009, + "num_tokens": 25747495.0, + "reward": 2.9916930198669434, + "reward_std": 0.04631907120347023, + "rewards/reward_fn/mean": 2.9916930198669434, + "rewards/reward_fn/std": 0.04631907492876053, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 144.15625, + "completions/mean_terminated_length": 144.15625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.12819032220810717, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.008047458290093346, + "learning_rate": 5.782e-06, + "loss": 0.0003, + "num_tokens": 25774444.0, + "reward": 3.553483009338379, + "reward_std": 0.32421770691871643, + "rewards/reward_fn/mean": 3.553483009338379, + "rewards/reward_fn/std": 0.32421767711639404, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 130.5, + "completions/mean_terminated_length": 130.5, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.12830580898487123, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.01889216006384231, + "learning_rate": 5.78e-06, + "loss": 0.0008, + "num_tokens": 25808892.0, + "reward": 3.864591360092163, + "reward_std": 0.3237229287624359, + "rewards/reward_fn/mean": 3.864591360092163, + "rewards/reward_fn/std": 0.3237228989601135, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 277.28125, + "completions/mean_terminated_length": 277.28125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.1284212957616353, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.014911414793459699, + "learning_rate": 5.7779999999999996e-06, + "loss": 0.0006, + "num_tokens": 25838725.0, + "reward": 3.8158178329467773, + "reward_std": 0.4647669494152069, + "rewards/reward_fn/mean": 3.8158178329467773, + "rewards/reward_fn/std": 0.4647669196128845, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 138.59375, + "completions/mean_terminated_length": 138.59375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.12853678253839934, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.017053076458978467, + "learning_rate": 5.775999999999999e-06, + "loss": 0.0007, + "num_tokens": 25856472.0, + "reward": 3.684556007385254, + "reward_std": 0.7189526557922363, + "rewards/reward_fn/mean": 3.684556007385254, + "rewards/reward_fn/std": 0.7189525961875916, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 230.46875, + "completions/mean_terminated_length": 230.46875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.1286522693151634, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.01676173839950934, + "learning_rate": 5.774e-06, + "loss": 0.0007, + "num_tokens": 25886087.0, + "reward": 3.686138153076172, + "reward_std": 0.7707621455192566, + "rewards/reward_fn/mean": 3.686138153076172, + "rewards/reward_fn/std": 0.7707621455192566, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 85.09375, + "completions/mean_terminated_length": 85.09375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.12876775609192748, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.875, + "kl": 0.022740017258911394, + "learning_rate": 5.772e-06, + "loss": 0.0009, + "num_tokens": 25900682.0, + "reward": 3.9703173637390137, + "reward_std": 0.16791120171546936, + "rewards/reward_fn/mean": 3.9703173637390137, + "rewards/reward_fn/std": 0.16791123151779175, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 221.0625, + "completions/mean_terminated_length": 221.0625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.12888324286869154, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.012431820592610165, + "learning_rate": 5.769999999999999e-06, + "loss": 0.0005, + "num_tokens": 25915596.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 67.15625, + "completions/mean_terminated_length": 67.15625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.12899872964545558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.009260186299798079, + "learning_rate": 5.768e-06, + "loss": 0.0004, + "num_tokens": 25935953.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 178.84375, + "completions/mean_terminated_length": 178.84375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.12911421642221965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.008937875943956897, + "learning_rate": 5.7659999999999995e-06, + "loss": 0.0004, + "num_tokens": 25963564.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 205.09375, + "completions/mean_terminated_length": 205.09375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.12922970319898372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.017250490593141876, + "learning_rate": 5.764e-06, + "loss": 0.0007, + "num_tokens": 25984303.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 77.84375, + "completions/mean_terminated_length": 77.84375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.1293451899757478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.009659904935688246, + "learning_rate": 5.761999999999999e-06, + "loss": 0.0004, + "num_tokens": 26008330.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 163.21875, + "completions/mean_terminated_length": 163.21875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.12946067675251183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.011986788653302938, + "learning_rate": 5.76e-06, + "loss": 0.0005, + "num_tokens": 26037809.0, + "reward": 3.928560733795166, + "reward_std": 0.4041209816932678, + "rewards/reward_fn/mean": 3.928560733795166, + "rewards/reward_fn/std": 0.40412095189094543, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 89.46875, + "completions/mean_terminated_length": 89.46875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.1295761635292759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04833984375, + "kl": 0.004385530244690017, + "learning_rate": 5.758e-06, + "loss": 0.0002, + "num_tokens": 26058816.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 99.40625, + "completions/mean_terminated_length": 99.40625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.12969165030603996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.006751351400453132, + "learning_rate": 5.756e-06, + "loss": 0.0003, + "num_tokens": 26079117.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.12980713708280403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.012903382448712364, + "learning_rate": 5.7539999999999995e-06, + "loss": 0.0005, + "num_tokens": 26103446.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 262.75, + "completions/mean_terminated_length": 262.75, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.12992262385956807, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.015065704414155334, + "learning_rate": 5.751999999999999e-06, + "loss": 0.0006, + "num_tokens": 26136494.0, + "reward": 2.775742769241333, + "reward_std": 0.2999240756034851, + "rewards/reward_fn/mean": 2.775742769241333, + "rewards/reward_fn/std": 0.2999240756034851, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 78.59375, + "completions/mean_terminated_length": 78.59375, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.13003811063633214, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0625, + "kl": 0.011379029860108858, + "learning_rate": 5.75e-06, + "loss": 0.0005, + "num_tokens": 26147073.0, + "reward": 3.6223692893981934, + "reward_std": 0.05497672036290169, + "rewards/reward_fn/mean": 3.6223692893981934, + "rewards/reward_fn/std": 0.054976753890514374, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 73.78125, + "completions/mean_terminated_length": 73.78125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.1301535974130962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060791015625, + "kl": 0.0066353996808175, + "learning_rate": 5.748e-06, + "loss": 0.0003, + "num_tokens": 26162266.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 139.375, + "completions/mean_terminated_length": 139.375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.13026908418986027, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.00948651189537486, + "learning_rate": 5.7460000000000006e-06, + "loss": 0.0004, + "num_tokens": 26184806.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 212.375, + "completions/mean_terminated_length": 212.375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.1303845709666243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.010266729383147322, + "learning_rate": 5.7439999999999996e-06, + "loss": 0.0004, + "num_tokens": 26204050.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 134.28125, + "completions/mean_terminated_length": 134.28125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.13050005774338838, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.011971571511821821, + "learning_rate": 5.741999999999999e-06, + "loss": 0.0005, + "num_tokens": 26227579.0, + "reward": 3.1685292720794678, + "reward_std": 0.32370898127555847, + "rewards/reward_fn/mean": 3.1685292720794678, + "rewards/reward_fn/std": 0.3237089514732361, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 90.21875, + "completions/mean_terminated_length": 90.21875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.13061554452015245, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.012732727598631755, + "learning_rate": 5.74e-06, + "loss": 0.0005, + "num_tokens": 26245506.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 214.78125, + "completions/mean_terminated_length": 214.78125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.13073103129691652, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.018410007993225008, + "learning_rate": 5.738e-06, + "loss": 0.0007, + "num_tokens": 26271035.0, + "reward": 3.1988308429718018, + "reward_std": 0.3730441629886627, + "rewards/reward_fn/mean": 3.1988308429718018, + "rewards/reward_fn/std": 0.37304413318634033, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 200.15625, + "completions/mean_terminated_length": 200.15625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.13084651807368056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.00913558292813832, + "learning_rate": 5.736e-06, + "loss": 0.0004, + "num_tokens": 26304096.0, + "reward": 3.8540027141571045, + "reward_std": 0.5745319128036499, + "rewards/reward_fn/mean": 3.8540027141571045, + "rewards/reward_fn/std": 0.5745319128036499, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 179.28125, + "completions/mean_terminated_length": 179.28125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.13096200485044462, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.009338133728306275, + "learning_rate": 5.734e-06, + "loss": 0.0004, + "num_tokens": 26334345.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 170.46875, + "completions/mean_terminated_length": 170.46875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.1310774916272087, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.012235570713528432, + "learning_rate": 5.732e-06, + "loss": 0.0005, + "num_tokens": 26358232.0, + "reward": 3.6211225986480713, + "reward_std": 0.6469292640686035, + "rewards/reward_fn/mean": 3.6211225986480713, + "rewards/reward_fn/std": 0.6469293236732483, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 217.78125, + "completions/mean_terminated_length": 217.78125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.13119297840397273, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.015692261018557474, + "learning_rate": 5.73e-06, + "loss": 0.0006, + "num_tokens": 26377489.0, + "reward": 3.752195119857788, + "reward_std": 0.3770636022090912, + "rewards/reward_fn/mean": 3.752195119857788, + "rewards/reward_fn/std": 0.3770636022090912, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 400.6875, + "completions/mean_terminated_length": 400.6875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.1313084651807368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.007994081592187285, + "learning_rate": 5.727999999999999e-06, + "loss": 0.0003, + "num_tokens": 26409095.0, + "reward": 3.7372193336486816, + "reward_std": 0.6605976819992065, + "rewards/reward_fn/mean": 3.7372193336486816, + "rewards/reward_fn/std": 0.6605976819992065, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 255.40625, + "completions/mean_terminated_length": 255.40625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.13142395195750087, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.011410357859858777, + "learning_rate": 5.726e-06, + "loss": 0.0005, + "num_tokens": 26430324.0, + "reward": 3.9429008960723877, + "reward_std": 0.22469434142112732, + "rewards/reward_fn/mean": 3.9429008960723877, + "rewards/reward_fn/std": 0.2246943712234497, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 366.90625, + "completions/mean_terminated_length": 366.90625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.13153943873426494, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.008210178981244098, + "learning_rate": 5.724e-06, + "loss": 0.0003, + "num_tokens": 26457489.0, + "reward": 3.9251058101654053, + "reward_std": 0.42366552352905273, + "rewards/reward_fn/mean": 3.9251058101654053, + "rewards/reward_fn/std": 0.42366546392440796, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1636.0, + "completions/max_terminated_length": 1636.0, + "completions/mean_length": 321.5, + "completions/mean_terminated_length": 321.5, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.13165492551102898, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.011519094579853117, + "learning_rate": 5.7220000000000004e-06, + "loss": 0.0005, + "num_tokens": 26481633.0, + "reward": 3.6668882369995117, + "reward_std": 0.5063958168029785, + "rewards/reward_fn/mean": 3.6668882369995117, + "rewards/reward_fn/std": 0.5063958168029785, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 49.0625, + "completions/mean_terminated_length": 49.0625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.13177041228779304, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3125, + "kl": 0.016287460774037754, + "learning_rate": 5.7199999999999994e-06, + "loss": 0.0007, + "num_tokens": 26494435.0, + "reward": 3.75, + "reward_std": 0.9837387204170227, + "rewards/reward_fn/mean": 3.75, + "rewards/reward_fn/std": 0.9837387204170227, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 97.46875, + "completions/mean_terminated_length": 97.46875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.1318858990645571, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.007677632893319242, + "learning_rate": 5.717999999999999e-06, + "loss": 0.0003, + "num_tokens": 26509426.0, + "reward": 3.971407413482666, + "reward_std": 0.16174329817295074, + "rewards/reward_fn/mean": 3.971407413482666, + "rewards/reward_fn/std": 0.16174329817295074, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 219.875, + "completions/mean_terminated_length": 219.875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.13200138584132118, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.019511776976287365, + "learning_rate": 5.716e-06, + "loss": 0.0008, + "num_tokens": 26535566.0, + "reward": 2.909608840942383, + "reward_std": 0.047967035323381424, + "rewards/reward_fn/mean": 2.909608840942383, + "rewards/reward_fn/std": 0.04796702042222023, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 103.1875, + "completions/mean_terminated_length": 103.1875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.13211687261808522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.216796875, + "kl": 0.027548391139134765, + "learning_rate": 5.714e-06, + "loss": 0.0011, + "num_tokens": 26556948.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 131.875, + "completions/mean_terminated_length": 131.875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.13223235939484929, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.012723823689157143, + "learning_rate": 5.712e-06, + "loss": 0.0005, + "num_tokens": 26573296.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 174.75, + "completions/mean_terminated_length": 174.75, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.13234784617161335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.013433435509796254, + "learning_rate": 5.7099999999999995e-06, + "loss": 0.0005, + "num_tokens": 26591496.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 308.71875, + "completions/mean_terminated_length": 308.71875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.13246333294837742, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.02095324482070282, + "learning_rate": 5.708e-06, + "loss": 0.0008, + "num_tokens": 26612415.0, + "reward": 2.94525146484375, + "reward_std": 0.36160334944725037, + "rewards/reward_fn/mean": 2.94525146484375, + "rewards/reward_fn/std": 0.36160340905189514, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 118.71875, + "completions/mean_terminated_length": 118.71875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.13257881972514146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.0155329665140016, + "learning_rate": 5.706e-06, + "loss": 0.0006, + "num_tokens": 26626454.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.13269430650190553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046142578125, + "kl": 0.009535367775242776, + "learning_rate": 5.703999999999999e-06, + "loss": 0.0004, + "num_tokens": 26646598.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 93.4375, + "completions/mean_terminated_length": 93.4375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.1328097932786696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1025390625, + "kl": 0.015472709448658861, + "learning_rate": 5.702e-06, + "loss": 0.0006, + "num_tokens": 26676020.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 269.40625, + "completions/mean_terminated_length": 269.40625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.13292528005543366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.012645409369724803, + "learning_rate": 5.7e-06, + "loss": 0.0005, + "num_tokens": 26695137.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 119.96875, + "completions/mean_terminated_length": 119.96875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.1330407668321977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.01443312331684865, + "learning_rate": 5.698e-06, + "loss": 0.0006, + "num_tokens": 26720640.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 203.34375, + "completions/mean_terminated_length": 203.34375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.13315625360896177, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.010983346350258216, + "learning_rate": 5.695999999999999e-06, + "loss": 0.0004, + "num_tokens": 26744811.0, + "reward": 3.935849666595459, + "reward_std": 0.25257447361946106, + "rewards/reward_fn/mean": 3.935849666595459, + "rewards/reward_fn/std": 0.25257444381713867, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 180.53125, + "completions/mean_terminated_length": 180.53125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.13327174038572584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.015703022276284173, + "learning_rate": 5.694e-06, + "loss": 0.0006, + "num_tokens": 26770748.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 137.34375, + "completions/mean_terminated_length": 137.34375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.1333872271624899, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.014272584361606278, + "learning_rate": 5.692e-06, + "loss": 0.0006, + "num_tokens": 26793511.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 285.84375, + "completions/mean_terminated_length": 285.84375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.13350271393925395, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.013406641643086914, + "learning_rate": 5.69e-06, + "loss": 0.0005, + "num_tokens": 26813890.0, + "reward": 3.856616973876953, + "reward_std": 0.5642713308334351, + "rewards/reward_fn/mean": 3.856616973876953, + "rewards/reward_fn/std": 0.5642712712287903, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 231.8125, + "completions/mean_terminated_length": 231.8125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.13361820071601802, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.015184962889179587, + "learning_rate": 5.688e-06, + "loss": 0.0006, + "num_tokens": 26844412.0, + "reward": 3.7039430141448975, + "reward_std": 0.5357733368873596, + "rewards/reward_fn/mean": 3.7039430141448975, + "rewards/reward_fn/std": 0.5357732772827148, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 239.84375, + "completions/mean_terminated_length": 239.84375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.13373368749278208, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.015029308502562344, + "learning_rate": 5.6859999999999994e-06, + "loss": 0.0006, + "num_tokens": 26864535.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 103.125, + "completions/mean_terminated_length": 103.125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.13384917426954615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11962890625, + "kl": 0.016476073637022637, + "learning_rate": 5.684e-06, + "loss": 0.0007, + "num_tokens": 26884379.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 310.71875, + "completions/mean_terminated_length": 310.71875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.1339646610463102, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.017325882625300437, + "learning_rate": 5.682e-06, + "loss": 0.0007, + "num_tokens": 26906642.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 244.125, + "completions/mean_terminated_length": 244.125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.13408014782307426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.013895594573114067, + "learning_rate": 5.68e-06, + "loss": 0.0006, + "num_tokens": 26926838.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 95.78125, + "completions/mean_terminated_length": 95.78125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.13419563459983833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.109375, + "kl": 0.01711915052146651, + "learning_rate": 5.678e-06, + "loss": 0.0007, + "num_tokens": 26941743.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 141.78125, + "completions/mean_terminated_length": 141.78125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.13431112137660237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.014945964881917462, + "learning_rate": 5.6759999999999995e-06, + "loss": 0.0006, + "num_tokens": 26964648.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 107.125, + "completions/mean_terminated_length": 107.125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.13442660815336643, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0625, + "kl": 0.007969792022777256, + "learning_rate": 5.674e-06, + "loss": 0.0003, + "num_tokens": 26984268.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 163.59375, + "completions/mean_terminated_length": 163.59375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.1345420949301305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.011707206242135726, + "learning_rate": 5.671999999999999e-06, + "loss": 0.0005, + "num_tokens": 27013023.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 197.625, + "completions/mean_terminated_length": 197.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.13465758170689457, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.01619508034491446, + "learning_rate": 5.67e-06, + "loss": 0.0006, + "num_tokens": 27032307.0, + "reward": 3.824347972869873, + "reward_std": 0.5742858648300171, + "rewards/reward_fn/mean": 3.824347972869873, + "rewards/reward_fn/std": 0.5742858648300171, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 309.21875, + "completions/mean_terminated_length": 309.21875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.1347730684836586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.012768591303029098, + "learning_rate": 5.668e-06, + "loss": 0.0005, + "num_tokens": 27065946.0, + "reward": 3.828765869140625, + "reward_std": 0.459762305021286, + "rewards/reward_fn/mean": 3.828765869140625, + "rewards/reward_fn/std": 0.459762305021286, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 96.8125, + "completions/mean_terminated_length": 96.8125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.13488855526042268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.0142617870515096, + "learning_rate": 5.6660000000000005e-06, + "loss": 0.0006, + "num_tokens": 27081172.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 134.9375, + "completions/mean_terminated_length": 134.9375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.13500404203718674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.010648405906977132, + "learning_rate": 5.6639999999999995e-06, + "loss": 0.0004, + "num_tokens": 27103026.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 105.84375, + "completions/mean_terminated_length": 105.84375, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.1351195288139508, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.011819019506219774, + "learning_rate": 5.661999999999999e-06, + "loss": 0.0005, + "num_tokens": 27118797.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 136.5, + "completions/mean_terminated_length": 136.5, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.13523501559071485, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.012996308505535126, + "learning_rate": 5.66e-06, + "loss": 0.0005, + "num_tokens": 27137341.0, + "reward": 3.0184998512268066, + "reward_std": 0.048527851700782776, + "rewards/reward_fn/mean": 3.0184998512268066, + "rewards/reward_fn/std": 0.048527833074331284, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 195.0625, + "completions/mean_terminated_length": 195.0625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.13535050236747892, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.013537058577639982, + "learning_rate": 5.658e-06, + "loss": 0.0005, + "num_tokens": 27156223.0, + "reward": 3.9305355548858643, + "reward_std": 0.39295023679733276, + "rewards/reward_fn/mean": 3.9305355548858643, + "rewards/reward_fn/std": 0.39295023679733276, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 162.03125, + "completions/mean_terminated_length": 162.03125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.135465989144243, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.020973451479221694, + "learning_rate": 5.656e-06, + "loss": 0.0008, + "num_tokens": 27180000.0, + "reward": 3.802048683166504, + "reward_std": 0.4194432199001312, + "rewards/reward_fn/mean": 3.802048683166504, + "rewards/reward_fn/std": 0.4194432497024536, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 263.03125, + "completions/mean_terminated_length": 263.03125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.13558147592100706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04833984375, + "kl": 0.011589996152906679, + "learning_rate": 5.6539999999999996e-06, + "loss": 0.0005, + "num_tokens": 27203201.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 146.84375, + "completions/mean_terminated_length": 146.84375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.1356969626977711, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.01132364387740381, + "learning_rate": 5.652e-06, + "loss": 0.0005, + "num_tokens": 27225212.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 115.34375, + "completions/mean_terminated_length": 115.34375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.13581244947453516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.010936360580672044, + "learning_rate": 5.65e-06, + "loss": 0.0004, + "num_tokens": 27249095.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 199.0625, + "completions/mean_terminated_length": 199.0625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.13592793625129923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.010448087225086056, + "learning_rate": 5.647999999999999e-06, + "loss": 0.0004, + "num_tokens": 27273897.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1482.0, + "completions/max_terminated_length": 1482.0, + "completions/mean_length": 535.78125, + "completions/mean_terminated_length": 535.78125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.1360434230280633, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.014003406336996704, + "learning_rate": 5.646e-06, + "loss": 0.0006, + "num_tokens": 27309698.0, + "reward": 3.892928123474121, + "reward_std": 0.4521663784980774, + "rewards/reward_fn/mean": 3.892928123474121, + "rewards/reward_fn/std": 0.4521663784980774, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 340.59375, + "completions/mean_terminated_length": 340.59375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.13615890980482734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.009059360701940022, + "learning_rate": 5.644e-06, + "loss": 0.0004, + "num_tokens": 27335701.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 212.90625, + "completions/mean_terminated_length": 212.90625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.1362743965815914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.009228356575476937, + "learning_rate": 5.642e-06, + "loss": 0.0004, + "num_tokens": 27353266.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 135.4375, + "completions/mean_terminated_length": 135.4375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.13638988335835547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.012993660697247833, + "learning_rate": 5.639999999999999e-06, + "loss": 0.0005, + "num_tokens": 27376544.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 161.5, + "completions/mean_terminated_length": 161.5, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.13650537013511954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.010252449457766488, + "learning_rate": 5.638e-06, + "loss": 0.0004, + "num_tokens": 27401072.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 122.25, + "completions/mean_terminated_length": 122.25, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.13662085691188358, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.010713852614571806, + "learning_rate": 5.636e-06, + "loss": 0.0004, + "num_tokens": 27424248.0, + "reward": 3.9693074226379395, + "reward_std": 0.17362292110919952, + "rewards/reward_fn/mean": 3.9693074226379395, + "rewards/reward_fn/std": 0.17362292110919952, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 239.75, + "completions/mean_terminated_length": 239.75, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.13673634368864765, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.015307655339711346, + "learning_rate": 5.634e-06, + "loss": 0.0006, + "num_tokens": 27449776.0, + "reward": 3.9699149131774902, + "reward_std": 0.17018680274486542, + "rewards/reward_fn/mean": 3.9699149131774902, + "rewards/reward_fn/std": 0.17018677294254303, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 77.84375, + "completions/mean_terminated_length": 77.84375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.13685183046541172, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.008849916011968162, + "learning_rate": 5.632e-06, + "loss": 0.0004, + "num_tokens": 27469195.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 133.1875, + "completions/mean_terminated_length": 133.1875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.13696731724217578, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.013201818888774142, + "learning_rate": 5.6299999999999995e-06, + "loss": 0.0005, + "num_tokens": 27496369.0, + "reward": 3.9066519737243652, + "reward_std": 0.2950823903083801, + "rewards/reward_fn/mean": 3.9066519737243652, + "rewards/reward_fn/std": 0.2950823903083801, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 261.75, + "completions/mean_terminated_length": 261.75, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.13708280401893982, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.01237387172295712, + "learning_rate": 5.628e-06, + "loss": 0.0005, + "num_tokens": 27517673.0, + "reward": 3.930698871612549, + "reward_std": 0.39202722907066345, + "rewards/reward_fn/mean": 3.930698871612549, + "rewards/reward_fn/std": 0.39202722907066345, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 185.375, + "completions/mean_terminated_length": 185.375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.1371982907957039, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.018309696912183426, + "learning_rate": 5.626e-06, + "loss": 0.0007, + "num_tokens": 27539541.0, + "reward": 3.9554147720336914, + "reward_std": 0.17674051225185394, + "rewards/reward_fn/mean": 3.9554147720336914, + "rewards/reward_fn/std": 0.17674051225185394, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 85.78125, + "completions/mean_terminated_length": 85.78125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.13731377757246796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.01272331263317028, + "learning_rate": 5.623999999999999e-06, + "loss": 0.0005, + "num_tokens": 27553806.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 193.625, + "completions/mean_terminated_length": 193.625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.137429264349232, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.02032197522930801, + "learning_rate": 5.622e-06, + "loss": 0.0008, + "num_tokens": 27577346.0, + "reward": 3.1598808765411377, + "reward_std": 0.17219319939613342, + "rewards/reward_fn/mean": 3.1598808765411377, + "rewards/reward_fn/std": 0.17219318449497223, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 311.46875, + "completions/mean_terminated_length": 311.46875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.13754475112599607, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.014932343299733475, + "learning_rate": 5.6199999999999996e-06, + "loss": 0.0006, + "num_tokens": 27612305.0, + "reward": 2.9866251945495605, + "reward_std": 0.6515676379203796, + "rewards/reward_fn/mean": 2.9866251945495605, + "rewards/reward_fn/std": 0.6515675783157349, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 411.09375, + "completions/mean_terminated_length": 411.09375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.13766023790276014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.010395129502285272, + "learning_rate": 5.618e-06, + "loss": 0.0004, + "num_tokens": 27642420.0, + "reward": 3.825091600418091, + "reward_std": 0.5074405074119568, + "rewards/reward_fn/mean": 3.825091600418091, + "rewards/reward_fn/std": 0.5074405074119568, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 117.84375, + "completions/mean_terminated_length": 117.84375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.1377757246795242, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.012267334968782961, + "learning_rate": 5.615999999999999e-06, + "loss": 0.0005, + "num_tokens": 27663215.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 283.5625, + "completions/mean_terminated_length": 283.5625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.13789121145628824, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.01117176451953128, + "learning_rate": 5.614e-06, + "loss": 0.0004, + "num_tokens": 27680353.0, + "reward": 3.945420742034912, + "reward_std": 0.21513108909130096, + "rewards/reward_fn/mean": 3.945420742034912, + "rewards/reward_fn/std": 0.21513108909130096, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 65.09375, + "completions/mean_terminated_length": 65.09375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.1380066982330523, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.00841053396652569, + "learning_rate": 5.612e-06, + "loss": 0.0003, + "num_tokens": 27699876.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 218.0625, + "completions/mean_terminated_length": 218.0625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.13812218500981638, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.019289534160634503, + "learning_rate": 5.61e-06, + "loss": 0.0008, + "num_tokens": 27730694.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 233.6875, + "completions/mean_terminated_length": 233.6875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.13823767178658045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.017104809143347666, + "learning_rate": 5.6079999999999995e-06, + "loss": 0.0007, + "num_tokens": 27756636.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 88.875, + "completions/mean_terminated_length": 88.875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.13835315856334449, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2314453125, + "kl": 0.027060958615038544, + "learning_rate": 5.605999999999999e-06, + "loss": 0.0011, + "num_tokens": 27782936.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 156.65625, + "completions/mean_terminated_length": 156.65625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.13846864534010855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05029296875, + "kl": 0.008181184348359238, + "learning_rate": 5.604e-06, + "loss": 0.0003, + "num_tokens": 27803981.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 212.40625, + "completions/mean_terminated_length": 212.40625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.13858413211687262, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.010855478372832295, + "learning_rate": 5.602e-06, + "loss": 0.0004, + "num_tokens": 27826266.0, + "reward": 3.8588552474975586, + "reward_std": 0.5553984642028809, + "rewards/reward_fn/mean": 3.8588552474975586, + "rewards/reward_fn/std": 0.5553984045982361, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 191.90625, + "completions/mean_terminated_length": 191.90625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.1386996188936367, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.01044373596960213, + "learning_rate": 5.6e-06, + "loss": 0.0004, + "num_tokens": 27845143.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 56.0625, + "completions/mean_terminated_length": 56.0625, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.13881510567040073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.01073633398846141, + "learning_rate": 5.598e-06, + "loss": 0.0004, + "num_tokens": 27870265.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 118.9375, + "completions/mean_terminated_length": 118.9375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.1389305924471648, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.014878340371069498, + "learning_rate": 5.5959999999999994e-06, + "loss": 0.0006, + "num_tokens": 27886007.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 88.0, + "completions/max_terminated_length": 88.0, + "completions/mean_length": 66.0, + "completions/mean_terminated_length": 66.0, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.13904607922392886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1640625, + "kl": 0.016107084244140424, + "learning_rate": 5.594e-06, + "loss": 0.0006, + "num_tokens": 27907959.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 192.84375, + "completions/mean_terminated_length": 192.84375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.13916156600069293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0546875, + "kl": 0.010539189592236653, + "learning_rate": 5.591999999999999e-06, + "loss": 0.0004, + "num_tokens": 27930738.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 123.75, + "completions/mean_terminated_length": 123.75, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.13927705277745697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.009415195774636231, + "learning_rate": 5.59e-06, + "loss": 0.0004, + "num_tokens": 27953834.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 194.40625, + "completions/mean_terminated_length": 194.40625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.13939253955422104, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.011217577615752816, + "learning_rate": 5.588e-06, + "loss": 0.0004, + "num_tokens": 27970167.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 83.46875, + "completions/mean_terminated_length": 83.46875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.1395080263309851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.013445775308355223, + "learning_rate": 5.586e-06, + "loss": 0.0005, + "num_tokens": 27990086.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 124.625, + "completions/mean_terminated_length": 124.625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.13962351310774918, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.00885430190101033, + "learning_rate": 5.583999999999999e-06, + "loss": 0.0004, + "num_tokens": 28009498.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 161.03125, + "completions/mean_terminated_length": 161.03125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.13973899988451322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.008070733612839831, + "learning_rate": 5.581999999999999e-06, + "loss": 0.0003, + "num_tokens": 28031419.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 101.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 76.8125, + "completions/mean_terminated_length": 76.8125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.13985448666127728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.011076761737058405, + "learning_rate": 5.58e-06, + "loss": 0.0004, + "num_tokens": 28052661.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 206.90625, + "completions/mean_terminated_length": 206.90625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.13996997343804135, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.012962494816747494, + "learning_rate": 5.578e-06, + "loss": 0.0005, + "num_tokens": 28071826.0, + "reward": 3.9287467002868652, + "reward_std": 0.4030686914920807, + "rewards/reward_fn/mean": 3.9287467002868652, + "rewards/reward_fn/std": 0.4030686616897583, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 172.75, + "completions/mean_terminated_length": 172.75, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.14008546021480542, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.009065999336598907, + "learning_rate": 5.576e-06, + "loss": 0.0004, + "num_tokens": 28101546.0, + "reward": 3.3369336128234863, + "reward_std": 0.0476510226726532, + "rewards/reward_fn/mean": 3.3369336128234863, + "rewards/reward_fn/std": 0.04765104502439499, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 306.1875, + "completions/mean_terminated_length": 306.1875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.14020094699156946, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.016764901665737852, + "learning_rate": 5.5739999999999995e-06, + "loss": 0.0007, + "num_tokens": 28134416.0, + "reward": 2.797365188598633, + "reward_std": 0.3964698016643524, + "rewards/reward_fn/mean": 2.797365188598633, + "rewards/reward_fn/std": 0.39646974205970764, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 64.84375, + "completions/mean_terminated_length": 64.84375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.14031643376833353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051513671875, + "kl": 0.005304408708980191, + "learning_rate": 5.572e-06, + "loss": 0.0002, + "num_tokens": 28148203.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 185.9375, + "completions/mean_terminated_length": 185.9375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.1404319205450976, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.012620950990822166, + "learning_rate": 5.57e-06, + "loss": 0.0005, + "num_tokens": 28171625.0, + "reward": 3.9775004386901855, + "reward_std": 0.12727676331996918, + "rewards/reward_fn/mean": 3.9775004386901855, + "rewards/reward_fn/std": 0.12727677822113037, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 296.71875, + "completions/mean_terminated_length": 296.71875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.14054740732186163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.01054959419707302, + "learning_rate": 5.567999999999999e-06, + "loss": 0.0004, + "num_tokens": 28196128.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 317.6875, + "completions/mean_terminated_length": 317.6875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.1406628940986257, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.01367482567729894, + "learning_rate": 5.566e-06, + "loss": 0.0005, + "num_tokens": 28218774.0, + "reward": 3.794738292694092, + "reward_std": 0.6484237313270569, + "rewards/reward_fn/mean": 3.794738292694092, + "rewards/reward_fn/std": 0.6484237313270569, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 88.21875, + "completions/mean_terminated_length": 88.21875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.14077838087538977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.012851472398324404, + "learning_rate": 5.564e-06, + "loss": 0.0005, + "num_tokens": 28232701.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 133.75, + "completions/mean_terminated_length": 133.75, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.14089386765215384, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.016066105716163293, + "learning_rate": 5.562e-06, + "loss": 0.0006, + "num_tokens": 28259093.0, + "reward": 3.9613194465637207, + "reward_std": 0.21881093084812164, + "rewards/reward_fn/mean": 3.9613194465637207, + "rewards/reward_fn/std": 0.21881093084812164, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 93.53125, + "completions/mean_terminated_length": 93.53125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.14100935442891788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.008883150570909493, + "learning_rate": 5.559999999999999e-06, + "loss": 0.0004, + "num_tokens": 28275814.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 311.71875, + "completions/mean_terminated_length": 311.71875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.14112484120568194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.234375, + "kl": 0.010949614879791625, + "learning_rate": 5.558e-06, + "loss": 0.0004, + "num_tokens": 28301437.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 232.34375, + "completions/mean_terminated_length": 232.34375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.141240327982446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.014829241888946854, + "learning_rate": 5.556e-06, + "loss": 0.0006, + "num_tokens": 28338536.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 109.375, + "completions/mean_terminated_length": 109.375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.14135581475921008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.012822462871554308, + "learning_rate": 5.554e-06, + "loss": 0.0005, + "num_tokens": 28360468.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 243.875, + "completions/mean_terminated_length": 243.875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.14147130153597412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.014947564079193398, + "learning_rate": 5.5519999999999995e-06, + "loss": 0.0006, + "num_tokens": 28393744.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 122.53125, + "completions/mean_terminated_length": 122.53125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.1415867883127382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054931640625, + "kl": 0.0066696081157715525, + "learning_rate": 5.549999999999999e-06, + "loss": 0.0003, + "num_tokens": 28410177.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 91.625, + "completions/mean_terminated_length": 91.625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.14170227508950226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05859375, + "kl": 0.008239683747888193, + "learning_rate": 5.548e-06, + "loss": 0.0003, + "num_tokens": 28425877.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 112.6875, + "completions/mean_terminated_length": 112.6875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.14181776186626632, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.006876435028971173, + "learning_rate": 5.546e-06, + "loss": 0.0003, + "num_tokens": 28451563.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 106.28125, + "completions/mean_terminated_length": 106.28125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.14193324864303036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.007599752183523378, + "learning_rate": 5.543999999999999e-06, + "loss": 0.0003, + "num_tokens": 28477396.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 175.375, + "completions/mean_terminated_length": 175.375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.14204873541979443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046630859375, + "kl": 0.00783907252480276, + "learning_rate": 5.542e-06, + "loss": 0.0003, + "num_tokens": 28500608.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 256.0625, + "completions/mean_terminated_length": 256.0625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.1421642221965585, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.012072291225194931, + "learning_rate": 5.5399999999999995e-06, + "loss": 0.0005, + "num_tokens": 28524962.0, + "reward": 3.931118965148926, + "reward_std": 0.3896503746509552, + "rewards/reward_fn/mean": 3.931118965148926, + "rewards/reward_fn/std": 0.3896503746509552, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 215.3125, + "completions/mean_terminated_length": 215.3125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.14227970897332257, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.01545518079365138, + "learning_rate": 5.538e-06, + "loss": 0.0006, + "num_tokens": 28542060.0, + "reward": 2.9678826332092285, + "reward_std": 0.04880320653319359, + "rewards/reward_fn/mean": 2.9678826332092285, + "rewards/reward_fn/std": 0.04880315065383911, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 110.0, + "completions/mean_terminated_length": 110.0, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.1423951957500866, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.013308429101016372, + "learning_rate": 5.535999999999999e-06, + "loss": 0.0005, + "num_tokens": 28564780.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 253.1875, + "completions/mean_terminated_length": 253.1875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.14251068252685067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.01009265033644624, + "learning_rate": 5.534e-06, + "loss": 0.0004, + "num_tokens": 28589042.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1119.0, + "completions/max_terminated_length": 1119.0, + "completions/mean_length": 482.84375, + "completions/mean_terminated_length": 482.84375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.14262616930361474, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.011700005881721154, + "learning_rate": 5.532e-06, + "loss": 0.0005, + "num_tokens": 28615021.0, + "reward": 3.7889816761016846, + "reward_std": 0.6665822863578796, + "rewards/reward_fn/mean": 3.7889816761016846, + "rewards/reward_fn/std": 0.6665822863578796, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 223.875, + "completions/mean_terminated_length": 223.875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.1427416560803788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435546875, + "kl": 0.020305315440054983, + "learning_rate": 5.53e-06, + "loss": 0.0008, + "num_tokens": 28644489.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 215.65625, + "completions/mean_terminated_length": 215.65625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.14285714285714285, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.011160176363773644, + "learning_rate": 5.527999999999999e-06, + "loss": 0.0004, + "num_tokens": 28668222.0, + "reward": 3.251462459564209, + "reward_std": 0.408997505903244, + "rewards/reward_fn/mean": 3.251462459564209, + "rewards/reward_fn/std": 0.4089975357055664, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 324.5625, + "completions/mean_terminated_length": 324.5625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.14297262963390692, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.010823504489962943, + "learning_rate": 5.525999999999999e-06, + "loss": 0.0004, + "num_tokens": 28702992.0, + "reward": 3.5895895957946777, + "reward_std": 0.6535202264785767, + "rewards/reward_fn/mean": 3.5895895957946777, + "rewards/reward_fn/std": 0.6535202860832214, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 208.625, + "completions/mean_terminated_length": 208.625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.14308811641067098, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.009616549767088145, + "learning_rate": 5.524e-06, + "loss": 0.0004, + "num_tokens": 28734436.0, + "reward": 3.0872201919555664, + "reward_std": 0.6420595049858093, + "rewards/reward_fn/mean": 3.0872201919555664, + "rewards/reward_fn/std": 0.6420594453811646, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 138.5625, + "completions/mean_terminated_length": 138.5625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.14320360318743505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.008642053537187167, + "learning_rate": 5.522e-06, + "loss": 0.0003, + "num_tokens": 28750710.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 228.21875, + "completions/mean_terminated_length": 228.21875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.1433190899641991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048095703125, + "kl": 0.009842003477388062, + "learning_rate": 5.52e-06, + "loss": 0.0004, + "num_tokens": 28773277.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 249.65625, + "completions/mean_terminated_length": 249.65625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.14343457674096316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.014578653819626197, + "learning_rate": 5.5179999999999995e-06, + "loss": 0.0006, + "num_tokens": 28801490.0, + "reward": 3.352717876434326, + "reward_std": 0.8217841386795044, + "rewards/reward_fn/mean": 3.352717876434326, + "rewards/reward_fn/std": 0.8217841386795044, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 118.5625, + "completions/mean_terminated_length": 118.5625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.14355006351772723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.008402690877119312, + "learning_rate": 5.515999999999999e-06, + "loss": 0.0003, + "num_tokens": 28817700.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 254.75, + "completions/mean_terminated_length": 254.75, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.14366555029449127, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.013130235092830844, + "learning_rate": 5.514e-06, + "loss": 0.0005, + "num_tokens": 28837852.0, + "reward": 3.932821035385132, + "reward_std": 0.38002142310142517, + "rewards/reward_fn/mean": 3.932821035385132, + "rewards/reward_fn/std": 0.38002142310142517, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 130.3125, + "completions/mean_terminated_length": 130.3125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.14378103707125534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.020628446363843977, + "learning_rate": 5.511999999999999e-06, + "loss": 0.0008, + "num_tokens": 28869958.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 125.8125, + "completions/mean_terminated_length": 125.8125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.1438965238480194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.012761988713464234, + "learning_rate": 5.51e-06, + "loss": 0.0005, + "num_tokens": 28896704.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 88.40625, + "completions/mean_terminated_length": 88.40625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.14401201062478347, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.015625, + "kl": 0.016427883514552377, + "learning_rate": 5.508e-06, + "loss": 0.0007, + "num_tokens": 28924109.0, + "reward": 3.864248275756836, + "reward_std": 0.5341769456863403, + "rewards/reward_fn/mean": 3.864248275756836, + "rewards/reward_fn/std": 0.5341768860816956, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 68.46875, + "completions/mean_terminated_length": 68.46875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.1441274974015475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.40234375, + "kl": 0.02413847557909321, + "learning_rate": 5.506e-06, + "loss": 0.001, + "num_tokens": 28955484.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 115.125, + "completions/mean_terminated_length": 115.125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.14424298417831158, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.009908517604344524, + "learning_rate": 5.503999999999999e-06, + "loss": 0.0004, + "num_tokens": 28972832.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 200.40625, + "completions/mean_terminated_length": 200.40625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.14435847095507565, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044921875, + "kl": 0.008199832198442891, + "learning_rate": 5.501999999999999e-06, + "loss": 0.0003, + "num_tokens": 29001613.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 296.75, + "completions/mean_terminated_length": 296.75, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.14447395773183971, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.018968315634992905, + "learning_rate": 5.5e-06, + "loss": 0.0008, + "num_tokens": 29030341.0, + "reward": 3.226468324661255, + "reward_std": 0.47065797448158264, + "rewards/reward_fn/mean": 3.226468324661255, + "rewards/reward_fn/std": 0.47065797448158264, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 229.84375, + "completions/mean_terminated_length": 229.84375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.14458944450860375, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.01641292827844154, + "learning_rate": 5.498e-06, + "loss": 0.0007, + "num_tokens": 29051808.0, + "reward": 3.475940704345703, + "reward_std": 0.8325286507606506, + "rewards/reward_fn/mean": 3.475940704345703, + "rewards/reward_fn/std": 0.8325286507606506, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 389.25, + "completions/mean_terminated_length": 389.25, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.14470493128536782, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.014252955472329631, + "learning_rate": 5.496e-06, + "loss": 0.0006, + "num_tokens": 29079272.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 257.40625, + "completions/mean_terminated_length": 257.40625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.1448204180621319, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.01095535160129657, + "learning_rate": 5.493999999999999e-06, + "loss": 0.0004, + "num_tokens": 29095733.0, + "reward": 3.300767183303833, + "reward_std": 0.4997071921825409, + "rewards/reward_fn/mean": 3.300767183303833, + "rewards/reward_fn/std": 0.4997071623802185, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 166.875, + "completions/mean_terminated_length": 166.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.14493590483889596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.00758222934382502, + "learning_rate": 5.492e-06, + "loss": 0.0003, + "num_tokens": 29112145.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 93.9375, + "completions/mean_terminated_length": 93.9375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.14505139161566, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.013598211349744815, + "learning_rate": 5.49e-06, + "loss": 0.0005, + "num_tokens": 29139983.0, + "reward": 3.455019474029541, + "reward_std": 0.02516379952430725, + "rewards/reward_fn/mean": 3.455019474029541, + "rewards/reward_fn/std": 0.025163762271404266, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 119.03125, + "completions/mean_terminated_length": 119.03125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.14516687839242406, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.65625, + "kl": 0.010024185168731492, + "learning_rate": 5.488e-06, + "loss": 0.0004, + "num_tokens": 29169424.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 252.78125, + "completions/mean_terminated_length": 252.78125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.14528236516918813, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.009634667221689597, + "learning_rate": 5.486e-06, + "loss": 0.0004, + "num_tokens": 29197161.0, + "reward": 2.73622989654541, + "reward_std": 0.03561216592788696, + "rewards/reward_fn/mean": 2.73622989654541, + "rewards/reward_fn/std": 0.03561217710375786, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 303.15625, + "completions/mean_terminated_length": 303.15625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.1453978519459522, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.01113803828775417, + "learning_rate": 5.4839999999999995e-06, + "loss": 0.0004, + "num_tokens": 29221870.0, + "reward": 3.673279047012329, + "reward_std": 0.49385932087898254, + "rewards/reward_fn/mean": 3.673279047012329, + "rewards/reward_fn/std": 0.49385932087898254, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 98.90625, + "completions/mean_terminated_length": 98.90625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.14551333872271624, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.609375, + "kl": 0.010723165367380716, + "learning_rate": 5.482e-06, + "loss": 0.0004, + "num_tokens": 29237163.0, + "reward": 3.961958169937134, + "reward_std": 0.21519741415977478, + "rewards/reward_fn/mean": 3.961958169937134, + "rewards/reward_fn/std": 0.21519742906093597, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 188.71875, + "completions/mean_terminated_length": 188.71875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.1456288254994803, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.013625773441162892, + "learning_rate": 5.48e-06, + "loss": 0.0005, + "num_tokens": 29261986.0, + "reward": 3.6185317039489746, + "reward_std": 0.4742244780063629, + "rewards/reward_fn/mean": 3.6185317039489746, + "rewards/reward_fn/std": 0.4742245078086853, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 79.0, + "completions/max_terminated_length": 79.0, + "completions/mean_length": 52.3125, + "completions/mean_terminated_length": 52.3125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.14574431227624438, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1669921875, + "kl": 0.014210689856554382, + "learning_rate": 5.478e-06, + "loss": 0.0006, + "num_tokens": 29284748.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 291.96875, + "completions/mean_terminated_length": 291.96875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.14585979905300844, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.01586004756973125, + "learning_rate": 5.476e-06, + "loss": 0.0006, + "num_tokens": 29316971.0, + "reward": 3.2351999282836914, + "reward_std": 0.41108572483062744, + "rewards/reward_fn/mean": 3.2351999282836914, + "rewards/reward_fn/std": 0.41108569502830505, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 152.875, + "completions/mean_terminated_length": 152.875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.14597528582977248, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.010935255908407271, + "learning_rate": 5.474e-06, + "loss": 0.0004, + "num_tokens": 29345319.0, + "reward": 2.890829086303711, + "reward_std": 0.05074574798345566, + "rewards/reward_fn/mean": 2.890829086303711, + "rewards/reward_fn/std": 0.050745755434036255, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 86.53125, + "completions/mean_terminated_length": 86.53125, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.14609077260653655, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.00864216178888455, + "learning_rate": 5.472e-06, + "loss": 0.0003, + "num_tokens": 29357528.0, + "reward": 3.9740710258483887, + "reward_std": 0.14667697250843048, + "rewards/reward_fn/mean": 3.9740710258483887, + "rewards/reward_fn/std": 0.14667697250843048, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 75.875, + "completions/mean_terminated_length": 75.875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.14620625938330062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.010154147326829843, + "learning_rate": 5.469999999999999e-06, + "loss": 0.0004, + "num_tokens": 29373300.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 113.3125, + "completions/mean_terminated_length": 113.3125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.1463217461600647, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "kl": 0.009592994538252242, + "learning_rate": 5.468e-06, + "loss": 0.0004, + "num_tokens": 29399358.0, + "reward": 3.3357865810394287, + "reward_std": 0.026912033557891846, + "rewards/reward_fn/mean": 3.3357865810394287, + "rewards/reward_fn/std": 0.02691204845905304, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 292.09375, + "completions/mean_terminated_length": 292.09375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.14643723293682873, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.009095020956010558, + "learning_rate": 5.466e-06, + "loss": 0.0004, + "num_tokens": 29427329.0, + "reward": 3.864032030105591, + "reward_std": 0.47601115703582764, + "rewards/reward_fn/mean": 3.864032030105591, + "rewards/reward_fn/std": 0.47601118683815, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 304.875, + "completions/mean_terminated_length": 304.875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.1465527197135928, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.0174492259247927, + "learning_rate": 5.4640000000000005e-06, + "loss": 0.0007, + "num_tokens": 29457341.0, + "reward": 3.589630365371704, + "reward_std": 0.6264307498931885, + "rewards/reward_fn/mean": 3.589630365371704, + "rewards/reward_fn/std": 0.6264306902885437, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 187.125, + "completions/mean_terminated_length": 187.125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.14666820649035686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.010209855499851983, + "learning_rate": 5.4619999999999995e-06, + "loss": 0.0004, + "num_tokens": 29477473.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 258.3125, + "completions/mean_terminated_length": 200.5806427001953, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.1467836932671209, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.57421875, + "kl": 0.019635517353890464, + "learning_rate": 5.459999999999999e-06, + "loss": 0.0008, + "num_tokens": 29496651.0, + "reward": 2.8650307655334473, + "reward_std": 0.523710310459137, + "rewards/reward_fn/mean": 2.8650307655334473, + "rewards/reward_fn/std": 0.523710310459137, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 193.53125, + "completions/mean_terminated_length": 193.53125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.14689918004388497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.012488326246966608, + "learning_rate": 5.458e-06, + "loss": 0.0005, + "num_tokens": 29513404.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 219.875, + "completions/mean_terminated_length": 219.875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.14701466682064904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.009446080002817325, + "learning_rate": 5.456e-06, + "loss": 0.0004, + "num_tokens": 29538072.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 181.625, + "completions/mean_terminated_length": 181.625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.1471301535974131, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.009712496292195283, + "learning_rate": 5.454e-06, + "loss": 0.0004, + "num_tokens": 29556332.0, + "reward": 3.864168882369995, + "reward_std": 0.5344967246055603, + "rewards/reward_fn/mean": 3.864168882369995, + "rewards/reward_fn/std": 0.5344967246055603, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 296.53125, + "completions/mean_terminated_length": 296.53125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.14724564037417714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.01206866407301277, + "learning_rate": 5.452e-06, + "loss": 0.0005, + "num_tokens": 29579837.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 159.90625, + "completions/mean_terminated_length": 159.90625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.1473611271509412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.015999259776435792, + "learning_rate": 5.45e-06, + "loss": 0.0006, + "num_tokens": 29596794.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 223.46875, + "completions/mean_terminated_length": 223.46875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.14747661392770528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.013357341929804534, + "learning_rate": 5.448e-06, + "loss": 0.0005, + "num_tokens": 29619913.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 211.15625, + "completions/mean_terminated_length": 211.15625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.14759210070446935, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.019492482548230328, + "learning_rate": 5.445999999999999e-06, + "loss": 0.0008, + "num_tokens": 29651246.0, + "reward": 3.9321846961975098, + "reward_std": 0.22035369277000427, + "rewards/reward_fn/mean": 3.9321846961975098, + "rewards/reward_fn/std": 0.22035366296768188, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 176.65625, + "completions/mean_terminated_length": 176.65625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.1477075874812334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.01253172401629854, + "learning_rate": 5.444e-06, + "loss": 0.0005, + "num_tokens": 29668867.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 235.78125, + "completions/mean_terminated_length": 235.78125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.14782307425799746, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.018298882408998907, + "learning_rate": 5.442e-06, + "loss": 0.0007, + "num_tokens": 29697148.0, + "reward": 3.8370537757873535, + "reward_std": 0.3460940420627594, + "rewards/reward_fn/mean": 3.8370537757873535, + "rewards/reward_fn/std": 0.3460940718650818, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 92.28125, + "completions/mean_terminated_length": 92.28125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.14793856103476152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16796875, + "kl": 0.022986386436969042, + "learning_rate": 5.4400000000000004e-06, + "loss": 0.0009, + "num_tokens": 29717253.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 89.5, + "completions/mean_terminated_length": 89.5, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.1480540478115256, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.014939023123588413, + "learning_rate": 5.4379999999999994e-06, + "loss": 0.0006, + "num_tokens": 29732341.0, + "reward": 3.928985118865967, + "reward_std": 0.4017207622528076, + "rewards/reward_fn/mean": 3.928985118865967, + "rewards/reward_fn/std": 0.40172073245048523, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 249.09375, + "completions/mean_terminated_length": 249.09375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.14816953458828963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.009995119355153292, + "learning_rate": 5.435999999999999e-06, + "loss": 0.0004, + "num_tokens": 29755544.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 94.59375, + "completions/mean_terminated_length": 94.59375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.1482850213650537, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8125, + "kl": 0.015638147306162864, + "learning_rate": 5.434e-06, + "loss": 0.0006, + "num_tokens": 29776171.0, + "reward": 3.9758450984954834, + "reward_std": 0.13664044439792633, + "rewards/reward_fn/mean": 3.9758450984954834, + "rewards/reward_fn/std": 0.13664045929908752, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 73.25, + "completions/mean_terminated_length": 73.25, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.14840050814181777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.011481325193017256, + "learning_rate": 5.432e-06, + "loss": 0.0005, + "num_tokens": 29801971.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1214.0, + "completions/max_terminated_length": 1214.0, + "completions/mean_length": 284.6875, + "completions/mean_terminated_length": 284.6875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.14851599491858183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05126953125, + "kl": 0.011724568845238537, + "learning_rate": 5.43e-06, + "loss": 0.0005, + "num_tokens": 29823721.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 275.5, + "completions/mean_terminated_length": 275.5, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.14863148169534587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057861328125, + "kl": 0.011853297866764478, + "learning_rate": 5.4279999999999995e-06, + "loss": 0.0005, + "num_tokens": 29848537.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 113.28125, + "completions/mean_terminated_length": 113.28125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.14874696847210994, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.011451665493950713, + "learning_rate": 5.426e-06, + "loss": 0.0005, + "num_tokens": 29862498.0, + "reward": 3.9656894207000732, + "reward_std": 0.1350104957818985, + "rewards/reward_fn/mean": 3.9656894207000732, + "rewards/reward_fn/std": 0.1350104957818985, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 258.96875, + "completions/mean_terminated_length": 258.96875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.148862455248874, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054931640625, + "kl": 0.01238767959875986, + "learning_rate": 5.424e-06, + "loss": 0.0005, + "num_tokens": 29884961.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 198.40625, + "completions/mean_terminated_length": 198.40625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.14897794202563808, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.013865499189705588, + "learning_rate": 5.421999999999999e-06, + "loss": 0.0006, + "num_tokens": 29913486.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 135.0625, + "completions/mean_terminated_length": 135.0625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.14909342880240212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.010499922784219962, + "learning_rate": 5.42e-06, + "loss": 0.0004, + "num_tokens": 29941232.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 135.78125, + "completions/mean_terminated_length": 135.78125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.14920891557916618, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1025390625, + "kl": 0.014579168215277605, + "learning_rate": 5.418e-06, + "loss": 0.0006, + "num_tokens": 29970953.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 228.78125, + "completions/mean_terminated_length": 228.78125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.14932440235593025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048095703125, + "kl": 0.008899642212782055, + "learning_rate": 5.416e-06, + "loss": 0.0004, + "num_tokens": 29993538.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 79.46875, + "completions/mean_terminated_length": 79.46875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.14943988913269432, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.010271038743667305, + "learning_rate": 5.413999999999999e-06, + "loss": 0.0004, + "num_tokens": 30029329.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 290.3125, + "completions/mean_terminated_length": 290.3125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.14955537590945836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.01419490204716567, + "learning_rate": 5.412e-06, + "loss": 0.0006, + "num_tokens": 30057947.0, + "reward": 3.4065439701080322, + "reward_std": 0.8498892784118652, + "rewards/reward_fn/mean": 3.4065439701080322, + "rewards/reward_fn/std": 0.8498892784118652, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 184.71875, + "completions/mean_terminated_length": 184.71875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.14967086268622243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.005989771991153248, + "learning_rate": 5.41e-06, + "loss": 0.0002, + "num_tokens": 30076210.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 180.1875, + "completions/mean_terminated_length": 180.1875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.1497863494629865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.01133689159178175, + "learning_rate": 5.408e-06, + "loss": 0.0005, + "num_tokens": 30094424.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 257.25, + "completions/mean_terminated_length": 257.25, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.14990183623975054, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.013221929664723575, + "learning_rate": 5.4059999999999996e-06, + "loss": 0.0005, + "num_tokens": 30119616.0, + "reward": 3.838925361633301, + "reward_std": 0.4819112718105316, + "rewards/reward_fn/mean": 3.838925361633301, + "rewards/reward_fn/std": 0.481911301612854, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 120.15625, + "completions/mean_terminated_length": 120.15625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.1500173230165146, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "kl": 0.010343138936150353, + "learning_rate": 5.403999999999999e-06, + "loss": 0.0004, + "num_tokens": 30148389.0, + "reward": 3.86480712890625, + "reward_std": 0.5320358276367188, + "rewards/reward_fn/mean": 3.86480712890625, + "rewards/reward_fn/std": 0.5320358276367188, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 196.34375, + "completions/mean_terminated_length": 196.34375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.15013280979327867, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.01619694076362066, + "learning_rate": 5.402e-06, + "loss": 0.0006, + "num_tokens": 30177936.0, + "reward": 3.909064769744873, + "reward_std": 0.40901097655296326, + "rewards/reward_fn/mean": 3.909064769744873, + "rewards/reward_fn/std": 0.40901097655296326, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1106.0, + "completions/max_terminated_length": 1106.0, + "completions/mean_length": 409.34375, + "completions/mean_terminated_length": 409.34375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.15024829657004274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.014577579131582752, + "learning_rate": 5.4e-06, + "loss": 0.0006, + "num_tokens": 30208667.0, + "reward": 2.975217580795288, + "reward_std": 0.8177131414413452, + "rewards/reward_fn/mean": 2.975217580795288, + "rewards/reward_fn/std": 0.8177131414413452, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 194.09375, + "completions/mean_terminated_length": 194.09375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.15036378334680678, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0458984375, + "kl": 0.007764260066323914, + "learning_rate": 5.398e-06, + "loss": 0.0003, + "num_tokens": 30241342.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 204.125, + "completions/mean_terminated_length": 204.125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.15047927012357085, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.010365139809437096, + "learning_rate": 5.396e-06, + "loss": 0.0004, + "num_tokens": 30264898.0, + "reward": 3.883193016052246, + "reward_std": 0.4284785985946655, + "rewards/reward_fn/mean": 3.883193016052246, + "rewards/reward_fn/std": 0.42847853899002075, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 96.15625, + "completions/mean_terminated_length": 96.15625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.15059475690033491, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.008825256387353875, + "learning_rate": 5.3939999999999995e-06, + "loss": 0.0004, + "num_tokens": 30280743.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 300.90625, + "completions/mean_terminated_length": 300.90625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.15071024367709898, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.012821565658668987, + "learning_rate": 5.392e-06, + "loss": 0.0005, + "num_tokens": 30299396.0, + "reward": 2.805809736251831, + "reward_std": 0.2740302085876465, + "rewards/reward_fn/mean": 2.805809736251831, + "rewards/reward_fn/std": 0.2740302085876465, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 388.6875, + "completions/mean_terminated_length": 388.6875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.15082573045386302, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.008236529014538974, + "learning_rate": 5.389999999999999e-06, + "loss": 0.0003, + "num_tokens": 30335066.0, + "reward": 3.7499399185180664, + "reward_std": 0.8214887380599976, + "rewards/reward_fn/mean": 3.7499399185180664, + "rewards/reward_fn/std": 0.8214886784553528, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 340.25, + "completions/mean_terminated_length": 340.25, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.1509412172306271, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.012288401630939916, + "learning_rate": 5.388e-06, + "loss": 0.0005, + "num_tokens": 30365826.0, + "reward": 2.5913662910461426, + "reward_std": 0.37430649995803833, + "rewards/reward_fn/mean": 2.5913662910461426, + "rewards/reward_fn/std": 0.3743065297603607, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 88.0, + "completions/mean_terminated_length": 88.0, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.15105670400739116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033935546875, + "kl": 0.0030655439222755376, + "learning_rate": 5.386e-06, + "loss": 0.0001, + "num_tokens": 30382786.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 122.0, + "completions/mean_terminated_length": 122.0, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.15117219078415522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.011715907487086952, + "learning_rate": 5.3840000000000005e-06, + "loss": 0.0005, + "num_tokens": 30404898.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 156.125, + "completions/mean_terminated_length": 156.125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.15128767756091926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.01276158686960116, + "learning_rate": 5.3819999999999995e-06, + "loss": 0.0005, + "num_tokens": 30428422.0, + "reward": 3.732957601547241, + "reward_std": 0.5256021618843079, + "rewards/reward_fn/mean": 3.732957601547241, + "rewards/reward_fn/std": 0.5256021618843079, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 82.75, + "completions/mean_terminated_length": 82.75, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.15140316433768333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.008691215509315953, + "learning_rate": 5.379999999999999e-06, + "loss": 0.0003, + "num_tokens": 30456126.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 91.25, + "completions/mean_terminated_length": 91.25, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.1515186511144474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056396484375, + "kl": 0.006193318986333907, + "learning_rate": 5.378e-06, + "loss": 0.0002, + "num_tokens": 30479686.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 452.625, + "completions/mean_terminated_length": 401.1612854003906, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.15163413789121147, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8046875, + "kl": 0.011599238583585247, + "learning_rate": 5.376e-06, + "loss": 0.0005, + "num_tokens": 30519034.0, + "reward": 2.8127758502960205, + "reward_std": 0.5877657532691956, + "rewards/reward_fn/mean": 2.8127758502960205, + "rewards/reward_fn/std": 0.5877656936645508, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 107.4375, + "completions/mean_terminated_length": 107.4375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.1517496246679755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.010344611320761032, + "learning_rate": 5.374e-06, + "loss": 0.0004, + "num_tokens": 30534632.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 311.53125, + "completions/mean_terminated_length": 311.53125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.15186511144473958, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.015113259592908435, + "learning_rate": 5.3719999999999996e-06, + "loss": 0.0006, + "num_tokens": 30563929.0, + "reward": 2.535259485244751, + "reward_std": 0.9343302249908447, + "rewards/reward_fn/mean": 2.535259485244751, + "rewards/reward_fn/std": 0.9343302249908447, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 322.09375, + "completions/mean_terminated_length": 322.09375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.15198059822150364, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "kl": 0.015757466084323823, + "learning_rate": 5.37e-06, + "loss": 0.0006, + "num_tokens": 30587164.0, + "reward": 3.862279176712036, + "reward_std": 0.5419428944587708, + "rewards/reward_fn/mean": 3.862279176712036, + "rewards/reward_fn/std": 0.541942834854126, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 88.6875, + "completions/mean_terminated_length": 88.6875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.1520960849982677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.019123610836686566, + "learning_rate": 5.368e-06, + "loss": 0.0008, + "num_tokens": 30606706.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 106.1875, + "completions/mean_terminated_length": 106.1875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.15221157177503175, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.011948785439017229, + "learning_rate": 5.365999999999999e-06, + "loss": 0.0005, + "num_tokens": 30629432.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 95.125, + "completions/mean_terminated_length": 95.125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.15232705855179582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.009171925543341786, + "learning_rate": 5.364e-06, + "loss": 0.0004, + "num_tokens": 30649788.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 83.0, + "completions/mean_terminated_length": 83.0, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.1524425453285599, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.010846418233995792, + "learning_rate": 5.362e-06, + "loss": 0.0004, + "num_tokens": 30663196.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 208.8125, + "completions/mean_terminated_length": 208.8125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.15255803210532395, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.009155614548944868, + "learning_rate": 5.36e-06, + "loss": 0.0004, + "num_tokens": 30688534.0, + "reward": 3.8581295013427734, + "reward_std": 0.5583838820457458, + "rewards/reward_fn/mean": 3.8581295013427734, + "rewards/reward_fn/std": 0.5583838820457458, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 211.25, + "completions/mean_terminated_length": 211.25, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.152673518882088, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054443359375, + "kl": 0.012160110301920213, + "learning_rate": 5.357999999999999e-06, + "loss": 0.0005, + "num_tokens": 30707774.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 110.1875, + "completions/mean_terminated_length": 110.1875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.15278900565885206, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.009002934144518804, + "learning_rate": 5.356e-06, + "loss": 0.0004, + "num_tokens": 30729508.0, + "reward": 3.988844394683838, + "reward_std": 0.0631064847111702, + "rewards/reward_fn/mean": 3.988844394683838, + "rewards/reward_fn/std": 0.06310652196407318, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 177.71875, + "completions/mean_terminated_length": 177.71875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.15290449243561613, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.012773378504789434, + "learning_rate": 5.354e-06, + "loss": 0.0005, + "num_tokens": 30758139.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 343.90625, + "completions/mean_terminated_length": 343.90625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.15301997921238017, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.014451768045546487, + "learning_rate": 5.352e-06, + "loss": 0.0006, + "num_tokens": 30781848.0, + "reward": 3.789332866668701, + "reward_std": 0.6654887199401855, + "rewards/reward_fn/mean": 3.789332866668701, + "rewards/reward_fn/std": 0.6654887795448303, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 71.65625, + "completions/mean_terminated_length": 71.65625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.15313546598914424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.010145118620130233, + "learning_rate": 5.35e-06, + "loss": 0.0004, + "num_tokens": 30805325.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 78.375, + "completions/mean_terminated_length": 78.375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.1532509527659083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.005639935316139599, + "learning_rate": 5.3479999999999994e-06, + "loss": 0.0002, + "num_tokens": 30825017.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 195.1875, + "completions/mean_terminated_length": 195.1875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.15336643954267237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.01224995830853004, + "learning_rate": 5.346e-06, + "loss": 0.0005, + "num_tokens": 30844959.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 59.25, + "completions/mean_terminated_length": 59.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.1534819263194364, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.40625, + "kl": 0.02066310877853539, + "learning_rate": 5.344e-06, + "loss": 0.0008, + "num_tokens": 30859847.0, + "reward": 3.75, + "reward_std": 0.9837387204170227, + "rewards/reward_fn/mean": 3.75, + "rewards/reward_fn/std": 0.9837387204170227, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 99.09375, + "completions/mean_terminated_length": 99.09375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.15359741309620048, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1611328125, + "kl": 0.014194803399732336, + "learning_rate": 5.341999999999999e-06, + "loss": 0.0006, + "num_tokens": 30886250.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 94.90625, + "completions/mean_terminated_length": 94.90625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.15371289987296455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.011653240311716218, + "learning_rate": 5.34e-06, + "loss": 0.0005, + "num_tokens": 30901863.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1116.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 401.90625, + "completions/mean_terminated_length": 401.90625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.15382838664972862, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.009679916794993915, + "learning_rate": 5.3379999999999995e-06, + "loss": 0.0004, + "num_tokens": 30928036.0, + "reward": 3.928947925567627, + "reward_std": 0.40193161368370056, + "rewards/reward_fn/mean": 3.928947925567627, + "rewards/reward_fn/std": 0.40193164348602295, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 99.0625, + "completions/mean_terminated_length": 99.0625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.15394387342649266, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.008893439175153617, + "learning_rate": 5.336e-06, + "loss": 0.0004, + "num_tokens": 30959366.0, + "reward": 3.96535062789917, + "reward_std": 0.19600708782672882, + "rewards/reward_fn/mean": 3.96535062789917, + "rewards/reward_fn/std": 0.19600707292556763, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 241.71875, + "completions/mean_terminated_length": 241.71875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.15405936020325672, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.011535425437614322, + "learning_rate": 5.333999999999999e-06, + "loss": 0.0005, + "num_tokens": 30986525.0, + "reward": 2.941464900970459, + "reward_std": 0.31579098105430603, + "rewards/reward_fn/mean": 2.941464900970459, + "rewards/reward_fn/std": 0.31579098105430603, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 179.28125, + "completions/mean_terminated_length": 179.28125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.1541748469800208, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.008178790267265867, + "learning_rate": 5.332e-06, + "loss": 0.0003, + "num_tokens": 31005094.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 198.6875, + "completions/mean_terminated_length": 198.6875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.15429033375678486, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.008905281509214547, + "learning_rate": 5.33e-06, + "loss": 0.0004, + "num_tokens": 31033564.0, + "reward": 3.954476833343506, + "reward_std": 0.17943274974822998, + "rewards/reward_fn/mean": 3.954476833343506, + "rewards/reward_fn/std": 0.17943276464939117, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 234.0, + "completions/mean_terminated_length": 234.0, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.1544058205335489, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.01643057996989228, + "learning_rate": 5.328e-06, + "loss": 0.0007, + "num_tokens": 31064892.0, + "reward": 3.9699535369873047, + "reward_std": 0.16996899247169495, + "rewards/reward_fn/mean": 3.9699535369873047, + "rewards/reward_fn/std": 0.16996896266937256, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 287.125, + "completions/mean_terminated_length": 287.125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.15452130731031297, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.011713671527104452, + "learning_rate": 5.3259999999999995e-06, + "loss": 0.0005, + "num_tokens": 31100128.0, + "reward": 3.6892189979553223, + "reward_std": 0.7589834332466125, + "rewards/reward_fn/mean": 3.6892189979553223, + "rewards/reward_fn/std": 0.7589834332466125, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 102.96875, + "completions/mean_terminated_length": 102.96875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.15463679408707703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.013860143946658354, + "learning_rate": 5.323999999999999e-06, + "loss": 0.0006, + "num_tokens": 31121215.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 94.1875, + "completions/mean_terminated_length": 94.1875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.1547522808638411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.00671150399648468, + "learning_rate": 5.322e-06, + "loss": 0.0003, + "num_tokens": 31138341.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 249.09375, + "completions/mean_terminated_length": 249.09375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.15486776764060514, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.014853216824121773, + "learning_rate": 5.32e-06, + "loss": 0.0006, + "num_tokens": 31158856.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 195.84375, + "completions/mean_terminated_length": 195.84375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.1549832544173692, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.016688407471519895, + "learning_rate": 5.318e-06, + "loss": 0.0007, + "num_tokens": 31188195.0, + "reward": 3.633258819580078, + "reward_std": 0.4574331045150757, + "rewards/reward_fn/mean": 3.633258819580078, + "rewards/reward_fn/std": 0.4574331045150757, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 128.71875, + "completions/mean_terminated_length": 128.71875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.15509874119413328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.016216177464229986, + "learning_rate": 5.316e-06, + "loss": 0.0006, + "num_tokens": 31203642.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 268.46875, + "completions/mean_terminated_length": 268.46875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.15521422797089734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.010718581819674, + "learning_rate": 5.3139999999999994e-06, + "loss": 0.0004, + "num_tokens": 31221289.0, + "reward": 3.8825621604919434, + "reward_std": 0.2571810781955719, + "rewards/reward_fn/mean": 3.8825621604919434, + "rewards/reward_fn/std": 0.2571810185909271, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1601.0, + "completions/max_terminated_length": 1601.0, + "completions/mean_length": 624.21875, + "completions/mean_terminated_length": 624.21875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.15532971474766138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9609375, + "kl": 0.010065782815217972, + "learning_rate": 5.312e-06, + "loss": 0.0004, + "num_tokens": 31257904.0, + "reward": 3.7647969722747803, + "reward_std": 0.7429962754249573, + "rewards/reward_fn/mean": 3.7647969722747803, + "rewards/reward_fn/std": 0.742996335029602, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 134.1875, + "completions/mean_terminated_length": 134.1875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.15544520152442545, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.109375, + "kl": 0.01983989487052895, + "learning_rate": 5.309999999999999e-06, + "loss": 0.0008, + "num_tokens": 31273462.0, + "reward": 3.144252300262451, + "reward_std": 0.22751979529857635, + "rewards/reward_fn/mean": 3.144252300262451, + "rewards/reward_fn/std": 0.22751979529857635, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 119.65625, + "completions/mean_terminated_length": 119.65625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.15556068830118952, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.140625, + "kl": 0.02107408148003742, + "learning_rate": 5.308e-06, + "loss": 0.0008, + "num_tokens": 31295467.0, + "reward": 3.932598352432251, + "reward_std": 0.3812812864780426, + "rewards/reward_fn/mean": 3.932598352432251, + "rewards/reward_fn/std": 0.3812812566757202, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 103.0, + "completions/max_terminated_length": 103.0, + "completions/mean_length": 78.6875, + "completions/mean_terminated_length": 78.6875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.1556761750779536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046630859375, + "kl": 0.003507909164909506, + "learning_rate": 5.306e-06, + "loss": 0.0001, + "num_tokens": 31316417.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 85.25, + "completions/mean_terminated_length": 85.25, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.15579166185471763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.008521009600372054, + "learning_rate": 5.304e-06, + "loss": 0.0003, + "num_tokens": 31331049.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 277.90625, + "completions/mean_terminated_length": 277.90625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.1559071486314817, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.017743946795235388, + "learning_rate": 5.301999999999999e-06, + "loss": 0.0007, + "num_tokens": 31358310.0, + "reward": 3.1483333110809326, + "reward_std": 0.31259942054748535, + "rewards/reward_fn/mean": 3.1483333110809326, + "rewards/reward_fn/std": 0.31259942054748535, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 223.21875, + "completions/mean_terminated_length": 223.21875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.15602263540824576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03857421875, + "kl": 0.007556358985311817, + "learning_rate": 5.299999999999999e-06, + "loss": 0.0003, + "num_tokens": 31380781.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 57.9375, + "completions/mean_terminated_length": 57.9375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.1561381221850098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1865234375, + "kl": 0.018822964164428413, + "learning_rate": 5.298e-06, + "loss": 0.0008, + "num_tokens": 31404907.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 174.40625, + "completions/mean_terminated_length": 174.40625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.15625360896177387, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.011541184394445736, + "learning_rate": 5.296e-06, + "loss": 0.0005, + "num_tokens": 31432248.0, + "reward": 3.9706225395202637, + "reward_std": 0.16618315875530243, + "rewards/reward_fn/mean": 3.9706225395202637, + "rewards/reward_fn/std": 0.16618312895298004, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 243.875, + "completions/mean_terminated_length": 243.875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.15636909573853794, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.010188247018959373, + "learning_rate": 5.294e-06, + "loss": 0.0004, + "num_tokens": 31450868.0, + "reward": 3.9300365447998047, + "reward_std": 0.39577388763427734, + "rewards/reward_fn/mean": 3.9300365447998047, + "rewards/reward_fn/std": 0.39577385783195496, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 133.90625, + "completions/mean_terminated_length": 133.90625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.156484582515302, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.011891552523593418, + "learning_rate": 5.2919999999999995e-06, + "loss": 0.0005, + "num_tokens": 31467185.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 68.0, + "completions/mean_terminated_length": 68.0, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.15660006929206605, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.28125, + "kl": 0.007964514290506486, + "learning_rate": 5.29e-06, + "loss": 0.0003, + "num_tokens": 31481073.0, + "reward": 2.8632822036743164, + "reward_std": 0.027597155421972275, + "rewards/reward_fn/mean": 2.8632822036743164, + "rewards/reward_fn/std": 0.027597103267908096, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 108.3125, + "completions/mean_terminated_length": 108.3125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.15671555606883011, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.006276272413742845, + "learning_rate": 5.288e-06, + "loss": 0.0003, + "num_tokens": 31507963.0, + "reward": 3.6535470485687256, + "reward_std": 0.043440431356430054, + "rewards/reward_fn/mean": 3.6535470485687256, + "rewards/reward_fn/std": 0.04344039782881737, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 62.9375, + "completions/mean_terminated_length": 62.9375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.15683104284559418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.014057612308533862, + "learning_rate": 5.285999999999999e-06, + "loss": 0.0006, + "num_tokens": 31533241.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 303.46875, + "completions/mean_terminated_length": 303.46875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.15694652962235825, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.011565906315809116, + "learning_rate": 5.284e-06, + "loss": 0.0005, + "num_tokens": 31555240.0, + "reward": 3.9317398071289062, + "reward_std": 0.3861381411552429, + "rewards/reward_fn/mean": 3.9317398071289062, + "rewards/reward_fn/std": 0.3861382007598877, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 149.59375, + "completions/mean_terminated_length": 149.59375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.1570620163991223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.0061581862973980606, + "learning_rate": 5.2819999999999996e-06, + "loss": 0.0002, + "num_tokens": 31583003.0, + "reward": 3.9315969944000244, + "reward_std": 0.386945515871048, + "rewards/reward_fn/mean": 3.9315969944000244, + "rewards/reward_fn/std": 0.386945515871048, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 137.1875, + "completions/mean_terminated_length": 137.1875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.15717750317588636, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.013139331276761368, + "learning_rate": 5.28e-06, + "loss": 0.0005, + "num_tokens": 31605473.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1062.0, + "completions/mean_length": 408.15625, + "completions/mean_terminated_length": 355.258056640625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.15729298995265042, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.01154541684081778, + "learning_rate": 5.277999999999999e-06, + "loss": 0.0005, + "num_tokens": 31645894.0, + "reward": 3.6103131771087646, + "reward_std": 0.9369451403617859, + "rewards/reward_fn/mean": 3.6103131771087646, + "rewards/reward_fn/std": 0.9369450807571411, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 174.28125, + "completions/mean_terminated_length": 174.28125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.1574084767294145, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.012459736492019147, + "learning_rate": 5.276e-06, + "loss": 0.0005, + "num_tokens": 31663407.0, + "reward": 3.928856134414673, + "reward_std": 0.4024508595466614, + "rewards/reward_fn/mean": 3.928856134414673, + "rewards/reward_fn/std": 0.40245091915130615, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 74.8125, + "completions/mean_terminated_length": 74.8125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.15752396350617853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.007511476036597742, + "learning_rate": 5.274e-06, + "loss": 0.0003, + "num_tokens": 31679369.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 256.34375, + "completions/mean_terminated_length": 256.34375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.1576394502829426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05126953125, + "kl": 0.009553975280141458, + "learning_rate": 5.272e-06, + "loss": 0.0004, + "num_tokens": 31703796.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 370.65625, + "completions/mean_terminated_length": 370.65625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.15775493705970667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049072265625, + "kl": 0.01098277849087026, + "learning_rate": 5.2699999999999995e-06, + "loss": 0.0004, + "num_tokens": 31725257.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 258.9375, + "completions/mean_terminated_length": 258.9375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.15787042383647074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.01157790383149404, + "learning_rate": 5.267999999999999e-06, + "loss": 0.0005, + "num_tokens": 31746343.0, + "reward": 3.928675651550293, + "reward_std": 0.40347176790237427, + "rewards/reward_fn/mean": 3.928675651550293, + "rewards/reward_fn/std": 0.4034717381000519, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 171.125, + "completions/mean_terminated_length": 171.125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.15798591061323478, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.009253128882846795, + "learning_rate": 5.266e-06, + "loss": 0.0004, + "num_tokens": 31770283.0, + "reward": 3.9725708961486816, + "reward_std": 0.1551622748374939, + "rewards/reward_fn/mean": 3.9725708961486816, + "rewards/reward_fn/std": 0.15516223013401031, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 332.0625, + "completions/mean_terminated_length": 332.0625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.15810139738999884, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.012143766391091049, + "learning_rate": 5.264e-06, + "loss": 0.0005, + "num_tokens": 31796141.0, + "reward": 3.8963942527770996, + "reward_std": 0.27903059124946594, + "rewards/reward_fn/mean": 3.8963942527770996, + "rewards/reward_fn/std": 0.27903059124946594, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 95.71875, + "completions/mean_terminated_length": 95.71875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.1582168841667629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.006700867914332775, + "learning_rate": 5.261999999999999e-06, + "loss": 0.0003, + "num_tokens": 31825636.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 92.96875, + "completions/mean_terminated_length": 92.96875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.15833237094352698, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.009208457471686415, + "learning_rate": 5.26e-06, + "loss": 0.0004, + "num_tokens": 31847683.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 156.25, + "completions/mean_terminated_length": 156.25, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.15844785772029102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.020226184657076374, + "learning_rate": 5.2579999999999995e-06, + "loss": 0.0008, + "num_tokens": 31870763.0, + "reward": 3.8996124267578125, + "reward_std": 0.2699999213218689, + "rewards/reward_fn/mean": 3.8996124267578125, + "rewards/reward_fn/std": 0.2699998915195465, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 149.90625, + "completions/mean_terminated_length": 149.90625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.1585633444970551, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8125, + "kl": 0.009815879340749234, + "learning_rate": 5.256e-06, + "loss": 0.0004, + "num_tokens": 31898184.0, + "reward": 2.931215286254883, + "reward_std": 0.22605293989181519, + "rewards/reward_fn/mean": 2.931215286254883, + "rewards/reward_fn/std": 0.22605293989181519, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 209.53125, + "completions/mean_terminated_length": 209.53125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.15867883127381915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.011268414556980133, + "learning_rate": 5.253999999999999e-06, + "loss": 0.0005, + "num_tokens": 31927897.0, + "reward": 3.9298787117004395, + "reward_std": 0.3966652750968933, + "rewards/reward_fn/mean": 3.9298787117004395, + "rewards/reward_fn/std": 0.3966653048992157, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 118.46875, + "completions/mean_terminated_length": 118.46875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.15879431805058322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.007469220965504064, + "learning_rate": 5.252e-06, + "loss": 0.0003, + "num_tokens": 31955496.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 211.625, + "completions/mean_terminated_length": 211.625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.15890980482734726, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.020714817248517647, + "learning_rate": 5.25e-06, + "loss": 0.0008, + "num_tokens": 31982492.0, + "reward": 3.856450319290161, + "reward_std": 0.30699676275253296, + "rewards/reward_fn/mean": 3.856450319290161, + "rewards/reward_fn/std": 0.30699682235717773, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 174.3125, + "completions/mean_terminated_length": 174.3125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.15902529160411133, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.011310997244436294, + "learning_rate": 5.248e-06, + "loss": 0.0005, + "num_tokens": 32003942.0, + "reward": 3.036101818084717, + "reward_std": 0.039555009454488754, + "rewards/reward_fn/mean": 3.036101818084717, + "rewards/reward_fn/std": 0.03955504298210144, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.1591407783808754, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.0127674426039448, + "learning_rate": 5.246e-06, + "loss": 0.0005, + "num_tokens": 32024524.0, + "reward": 3.2712931632995605, + "reward_std": 1.0983104705810547, + "rewards/reward_fn/mean": 3.2712931632995605, + "rewards/reward_fn/std": 1.0983104705810547, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 202.625, + "completions/mean_terminated_length": 202.625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.15925626515763944, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.009895095055981074, + "learning_rate": 5.243999999999999e-06, + "loss": 0.0004, + "num_tokens": 32049664.0, + "reward": 3.8700966835021973, + "reward_std": 0.350460946559906, + "rewards/reward_fn/mean": 3.8700966835021973, + "rewards/reward_fn/std": 0.350460946559906, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 80.96875, + "completions/mean_terminated_length": 80.96875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.1593717519344035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.007573425675218459, + "learning_rate": 5.242e-06, + "loss": 0.0003, + "num_tokens": 32071263.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 182.59375, + "completions/mean_terminated_length": 182.59375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.15948723871116757, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.01241928331728559, + "learning_rate": 5.24e-06, + "loss": 0.0005, + "num_tokens": 32095186.0, + "reward": 3.879702568054199, + "reward_std": 0.2565233111381531, + "rewards/reward_fn/mean": 3.879702568054199, + "rewards/reward_fn/std": 0.25652334094047546, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 120.3125, + "completions/mean_terminated_length": 120.3125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.15960272548793164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0546875, + "kl": 0.006437986956370878, + "learning_rate": 5.2380000000000005e-06, + "loss": 0.0003, + "num_tokens": 32125212.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 137.15625, + "completions/mean_terminated_length": 137.15625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.15971821226469568, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.169921875, + "kl": 0.022444460046244785, + "learning_rate": 5.2359999999999995e-06, + "loss": 0.0009, + "num_tokens": 32147553.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 81.0625, + "completions/mean_terminated_length": 81.0625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.15983369904145975, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.140625, + "kl": 0.017129409970948473, + "learning_rate": 5.233999999999999e-06, + "loss": 0.0007, + "num_tokens": 32158211.0, + "reward": 3.4079599380493164, + "reward_std": 0.11332877725362778, + "rewards/reward_fn/mean": 3.4079599380493164, + "rewards/reward_fn/std": 0.11332882940769196, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 81.03125, + "completions/mean_terminated_length": 81.03125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.15994918581822382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.177734375, + "kl": 0.017982439196202904, + "learning_rate": 5.232e-06, + "loss": 0.0007, + "num_tokens": 32174692.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 140.4375, + "completions/mean_terminated_length": 140.4375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.16006467259498788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.014421165396925062, + "learning_rate": 5.23e-06, + "loss": 0.0006, + "num_tokens": 32197714.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 190.78125, + "completions/mean_terminated_length": 190.78125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.16018015937175192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.012715543562080711, + "learning_rate": 5.228e-06, + "loss": 0.0005, + "num_tokens": 32224843.0, + "reward": 3.1846368312835693, + "reward_std": 0.5718659162521362, + "rewards/reward_fn/mean": 3.1846368312835693, + "rewards/reward_fn/std": 0.5718659162521362, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 106.21875, + "completions/mean_terminated_length": 106.21875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.160295646148516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1884765625, + "kl": 0.019495920365443453, + "learning_rate": 5.226e-06, + "loss": 0.0008, + "num_tokens": 32247474.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 290.5625, + "completions/mean_terminated_length": 290.5625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.16041113292528006, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048583984375, + "kl": 0.009880308265564963, + "learning_rate": 5.224e-06, + "loss": 0.0004, + "num_tokens": 32271908.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 92.84375, + "completions/mean_terminated_length": 92.84375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.16052661970204413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.010594019637210295, + "learning_rate": 5.222e-06, + "loss": 0.0004, + "num_tokens": 32293279.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 213.5625, + "completions/mean_terminated_length": 213.5625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.16064210647880817, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.018040700451820157, + "learning_rate": 5.219999999999999e-06, + "loss": 0.0007, + "num_tokens": 32329233.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 65.46875, + "completions/mean_terminated_length": 65.46875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.16075759325557223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.010845176773727871, + "learning_rate": 5.218e-06, + "loss": 0.0004, + "num_tokens": 32347392.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 296.1875, + "completions/mean_terminated_length": 296.1875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.1608730800323363, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.015795018145581707, + "learning_rate": 5.216e-06, + "loss": 0.0006, + "num_tokens": 32364966.0, + "reward": 2.809316873550415, + "reward_std": 0.46879756450653076, + "rewards/reward_fn/mean": 2.809316873550415, + "rewards/reward_fn/std": 0.468797504901886, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 201.53125, + "completions/mean_terminated_length": 201.53125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.16098856680910037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.01633756661613006, + "learning_rate": 5.214e-06, + "loss": 0.0007, + "num_tokens": 32389783.0, + "reward": 3.7052664756774902, + "reward_std": 0.5343126654624939, + "rewards/reward_fn/mean": 3.7052664756774902, + "rewards/reward_fn/std": 0.5343126654624939, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 105.6875, + "completions/mean_terminated_length": 105.6875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.1611040535858644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.00879203531803796, + "learning_rate": 5.211999999999999e-06, + "loss": 0.0004, + "num_tokens": 32407533.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 192.375, + "completions/mean_terminated_length": 192.375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.16121954036262848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.01452598100877367, + "learning_rate": 5.21e-06, + "loss": 0.0006, + "num_tokens": 32425625.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 122.375, + "completions/mean_terminated_length": 122.375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.16133502713939255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.009700360576971434, + "learning_rate": 5.208e-06, + "loss": 0.0004, + "num_tokens": 32445573.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 269.90625, + "completions/mean_terminated_length": 269.90625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.1614505139161566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.02472810511244461, + "learning_rate": 5.206e-06, + "loss": 0.001, + "num_tokens": 32473154.0, + "reward": 3.198391914367676, + "reward_std": 0.9190080165863037, + "rewards/reward_fn/mean": 3.198391914367676, + "rewards/reward_fn/std": 0.9190080761909485, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 277.5625, + "completions/mean_terminated_length": 277.5625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.16156600069292065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.009690406659501605, + "learning_rate": 5.204e-06, + "loss": 0.0004, + "num_tokens": 32491220.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 99.40625, + "completions/mean_terminated_length": 99.40625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.16168148746968472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.00891672661236953, + "learning_rate": 5.2019999999999995e-06, + "loss": 0.0004, + "num_tokens": 32512097.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 100.0, + "completions/max_terminated_length": 100.0, + "completions/mean_length": 65.65625, + "completions/mean_terminated_length": 65.65625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.1617969742464488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.009142767878074665, + "learning_rate": 5.2e-06, + "loss": 0.0004, + "num_tokens": 32533270.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 181.125, + "completions/mean_terminated_length": 181.125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.16191246102321286, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.01976990094408393, + "learning_rate": 5.198e-06, + "loss": 0.0008, + "num_tokens": 32561850.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 135.4375, + "completions/mean_terminated_length": 135.4375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.1620279477999769, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "kl": 0.022819354431703687, + "learning_rate": 5.196e-06, + "loss": 0.0009, + "num_tokens": 32584328.0, + "reward": 3.807553291320801, + "reward_std": 0.37660402059555054, + "rewards/reward_fn/mean": 3.807553291320801, + "rewards/reward_fn/std": 0.37660402059555054, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 111.4375, + "completions/mean_terminated_length": 111.4375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.16214343457674096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.014295994173153304, + "learning_rate": 5.194e-06, + "loss": 0.0006, + "num_tokens": 32610806.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 253.375, + "completions/mean_terminated_length": 253.375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.16225892135350503, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.013198353932239115, + "learning_rate": 5.192e-06, + "loss": 0.0005, + "num_tokens": 32637058.0, + "reward": 3.899843215942383, + "reward_std": 0.3306277394294739, + "rewards/reward_fn/mean": 3.899843215942383, + "rewards/reward_fn/std": 0.3306277096271515, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 215.25, + "completions/mean_terminated_length": 215.25, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.16237440813026907, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.008127527784381527, + "learning_rate": 5.19e-06, + "loss": 0.0003, + "num_tokens": 32668362.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 96.71875, + "completions/mean_terminated_length": 96.71875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.16248989490703314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1240234375, + "kl": 0.01319352921564132, + "learning_rate": 5.187999999999999e-06, + "loss": 0.0005, + "num_tokens": 32690465.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 246.75, + "completions/mean_terminated_length": 246.75, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.1626053816837972, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.01048923612688668, + "learning_rate": 5.186e-06, + "loss": 0.0004, + "num_tokens": 32710361.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 341.0, + "completions/mean_terminated_length": 341.0, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.16272086846056127, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.01253171895223204, + "learning_rate": 5.184e-06, + "loss": 0.0005, + "num_tokens": 32740921.0, + "reward": 2.717700719833374, + "reward_std": 0.029394278302788734, + "rewards/reward_fn/mean": 2.717700719833374, + "rewards/reward_fn/std": 0.029394308105111122, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 138.21875, + "completions/mean_terminated_length": 138.21875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.16283635523732531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.008404802407312673, + "learning_rate": 5.1820000000000005e-06, + "loss": 0.0003, + "num_tokens": 32767424.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 112.65625, + "completions/mean_terminated_length": 112.65625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.16295184201408938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.005224540120252641, + "learning_rate": 5.1799999999999995e-06, + "loss": 0.0002, + "num_tokens": 32780533.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 63.21875, + "completions/mean_terminated_length": 63.21875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.16306732879085345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.009598833821655717, + "learning_rate": 5.177999999999999e-06, + "loss": 0.0004, + "num_tokens": 32801084.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 291.15625, + "completions/mean_terminated_length": 291.15625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.16318281556761752, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.013366147453780286, + "learning_rate": 5.176e-06, + "loss": 0.0005, + "num_tokens": 32842241.0, + "reward": 3.1464295387268066, + "reward_std": 0.32794174551963806, + "rewards/reward_fn/mean": 3.1464295387268066, + "rewards/reward_fn/std": 0.32794174551963806, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1265.0, + "completions/max_terminated_length": 1265.0, + "completions/mean_length": 296.34375, + "completions/mean_terminated_length": 296.34375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.16329830234438156, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.012310305224673357, + "learning_rate": 5.174e-06, + "loss": 0.0005, + "num_tokens": 32875084.0, + "reward": 2.5495004653930664, + "reward_std": 0.4733097553253174, + "rewards/reward_fn/mean": 2.5495004653930664, + "rewards/reward_fn/std": 0.47330978512763977, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 147.28125, + "completions/mean_terminated_length": 147.28125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.16341378912114563, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.015575832381728105, + "learning_rate": 5.172e-06, + "loss": 0.0006, + "num_tokens": 32902773.0, + "reward": 3.7026848793029785, + "reward_std": 0.20975418388843536, + "rewards/reward_fn/mean": 3.7026848793029785, + "rewards/reward_fn/std": 0.20975418388843536, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 252.5625, + "completions/mean_terminated_length": 252.5625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.1635292758979097, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.0125292933662422, + "learning_rate": 5.17e-06, + "loss": 0.0005, + "num_tokens": 32932711.0, + "reward": 3.46657657623291, + "reward_std": 0.5231244564056396, + "rewards/reward_fn/mean": 3.46657657623291, + "rewards/reward_fn/std": 0.5231243968009949, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 133.78125, + "completions/mean_terminated_length": 133.78125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.16364476267467376, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.014040815614862368, + "learning_rate": 5.168e-06, + "loss": 0.0006, + "num_tokens": 32960512.0, + "reward": 3.7873003482818604, + "reward_std": 0.3761400878429413, + "rewards/reward_fn/mean": 3.7873003482818604, + "rewards/reward_fn/std": 0.37614014744758606, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 226.0625, + "completions/mean_terminated_length": 226.0625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.1637602494514378, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.012719824102532584, + "learning_rate": 5.166e-06, + "loss": 0.0005, + "num_tokens": 32979202.0, + "reward": 3.635244846343994, + "reward_std": 0.861204206943512, + "rewards/reward_fn/mean": 3.635244846343994, + "rewards/reward_fn/std": 0.861204206943512, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 80.5, + "completions/mean_terminated_length": 80.5, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.16387573622820187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.012817712828109507, + "learning_rate": 5.163999999999999e-06, + "loss": 0.0005, + "num_tokens": 33003314.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 97.34375, + "completions/mean_terminated_length": 97.34375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.16399122300496594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.009733124988997588, + "learning_rate": 5.162e-06, + "loss": 0.0004, + "num_tokens": 33018877.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 257.96875, + "completions/mean_terminated_length": 257.96875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.16410670978173, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.0130471974844113, + "learning_rate": 5.16e-06, + "loss": 0.0005, + "num_tokens": 33042620.0, + "reward": 3.912297010421753, + "reward_std": 0.27749159932136536, + "rewards/reward_fn/mean": 3.912297010421753, + "rewards/reward_fn/std": 0.27749159932136536, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 114.15625, + "completions/mean_terminated_length": 114.15625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.16422219655849404, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.03125, + "kl": 0.013811227763653733, + "learning_rate": 5.158e-06, + "loss": 0.0006, + "num_tokens": 33066977.0, + "reward": 3.5286026000976562, + "reward_std": 0.4350418746471405, + "rewards/reward_fn/mean": 3.5286026000976562, + "rewards/reward_fn/std": 0.4350419044494629, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 115.71875, + "completions/mean_terminated_length": 115.71875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.1643376833352581, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.010190700340899639, + "learning_rate": 5.1559999999999994e-06, + "loss": 0.0004, + "num_tokens": 33083256.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 80.25, + "completions/mean_terminated_length": 80.25, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.16445317011202218, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1220703125, + "kl": 0.013552918419009075, + "learning_rate": 5.153999999999999e-06, + "loss": 0.0005, + "num_tokens": 33112640.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 234.0625, + "completions/mean_terminated_length": 234.0625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.16456865688878625, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.018502566759707406, + "learning_rate": 5.152e-06, + "loss": 0.0007, + "num_tokens": 33140226.0, + "reward": 3.7771925926208496, + "reward_std": 0.3608176112174988, + "rewards/reward_fn/mean": 3.7771925926208496, + "rewards/reward_fn/std": 0.3608176112174988, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 155.03125, + "completions/mean_terminated_length": 155.03125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.1646841436655503, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.01407915967865847, + "learning_rate": 5.15e-06, + "loss": 0.0006, + "num_tokens": 33154691.0, + "reward": 3.650594711303711, + "reward_std": 0.47618943452835083, + "rewards/reward_fn/mean": 3.650594711303711, + "rewards/reward_fn/std": 0.47618943452835083, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 188.28125, + "completions/mean_terminated_length": 188.28125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.16479963044231435, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.01991765439743176, + "learning_rate": 5.148e-06, + "loss": 0.0008, + "num_tokens": 33179820.0, + "reward": 3.2486019134521484, + "reward_std": 0.4476420283317566, + "rewards/reward_fn/mean": 3.2486019134521484, + "rewards/reward_fn/std": 0.4476419985294342, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 188.8125, + "completions/mean_terminated_length": 188.8125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.16491511721907842, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051513671875, + "kl": 0.009613679416361265, + "learning_rate": 5.1459999999999995e-06, + "loss": 0.0004, + "num_tokens": 33204198.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 319.8125, + "completions/mean_terminated_length": 319.8125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.1650306039958425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0439453125, + "kl": 0.010888098448049277, + "learning_rate": 5.144e-06, + "loss": 0.0004, + "num_tokens": 33230656.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 96.09375, + "completions/mean_terminated_length": 96.09375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.16514609077260653, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435546875, + "kl": 0.016716078876925167, + "learning_rate": 5.142e-06, + "loss": 0.0007, + "num_tokens": 33252227.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 205.03125, + "completions/mean_terminated_length": 205.03125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.1652615775493706, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.015559835359454155, + "learning_rate": 5.139999999999999e-06, + "loss": 0.0006, + "num_tokens": 33281092.0, + "reward": 3.8342700004577637, + "reward_std": 0.5769116878509521, + "rewards/reward_fn/mean": 3.8342700004577637, + "rewards/reward_fn/std": 0.5769116282463074, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 203.21875, + "completions/mean_terminated_length": 203.21875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.16537706432613467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.013086414794088341, + "learning_rate": 5.138e-06, + "loss": 0.0005, + "num_tokens": 33300107.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 193.125, + "completions/mean_terminated_length": 193.125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.1654925511028987, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.0076673409275827, + "learning_rate": 5.136e-06, + "loss": 0.0003, + "num_tokens": 33322607.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 436.53125, + "completions/mean_terminated_length": 436.53125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.16560803787966277, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.009478812353336252, + "learning_rate": 5.134e-06, + "loss": 0.0004, + "num_tokens": 33357728.0, + "reward": 3.5438852310180664, + "reward_std": 0.7019700407981873, + "rewards/reward_fn/mean": 3.5438852310180664, + "rewards/reward_fn/std": 0.7019700407981873, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 326.375, + "completions/mean_terminated_length": 326.375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.16572352465642684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037353515625, + "kl": 0.007936169939057436, + "learning_rate": 5.131999999999999e-06, + "loss": 0.0003, + "num_tokens": 33391020.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 245.15625, + "completions/mean_terminated_length": 245.15625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.1658390114331909, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.009463067093747668, + "learning_rate": 5.13e-06, + "loss": 0.0004, + "num_tokens": 33419121.0, + "reward": 2.875103712081909, + "reward_std": 0.3653436303138733, + "rewards/reward_fn/mean": 2.875103712081909, + "rewards/reward_fn/std": 0.36534368991851807, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 79.34375, + "completions/mean_terminated_length": 79.34375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.16595449820995495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12158203125, + "kl": 0.010164355118831736, + "learning_rate": 5.128e-06, + "loss": 0.0004, + "num_tokens": 33438236.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 130.09375, + "completions/mean_terminated_length": 130.09375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.16606998498671902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.009092920539842453, + "learning_rate": 5.126e-06, + "loss": 0.0004, + "num_tokens": 33453247.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 90.3125, + "completions/mean_terminated_length": 90.3125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.16618547176348308, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2041015625, + "kl": 0.01780278349906439, + "learning_rate": 5.1239999999999996e-06, + "loss": 0.0007, + "num_tokens": 33469801.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 303.71875, + "completions/mean_terminated_length": 303.71875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.16630095854024715, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.011718078989360947, + "learning_rate": 5.121999999999999e-06, + "loss": 0.0005, + "num_tokens": 33498720.0, + "reward": 3.1295342445373535, + "reward_std": 0.9025193452835083, + "rewards/reward_fn/mean": 3.1295342445373535, + "rewards/reward_fn/std": 0.9025192856788635, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 304.46875, + "completions/mean_terminated_length": 304.46875, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.1664164453170112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.00976366744725965, + "learning_rate": 5.12e-06, + "loss": 0.0004, + "num_tokens": 33536143.0, + "reward": 3.338778018951416, + "reward_std": 0.2700076997280121, + "rewards/reward_fn/mean": 3.338778018951416, + "rewards/reward_fn/std": 0.2700077295303345, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 88.6875, + "completions/mean_terminated_length": 88.6875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.16653193209377526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045654296875, + "kl": 0.003629199198257993, + "learning_rate": 5.118e-06, + "loss": 0.0001, + "num_tokens": 33557893.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 180.375, + "completions/mean_terminated_length": 180.375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.16664741887053933, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.009933225614076946, + "learning_rate": 5.116e-06, + "loss": 0.0004, + "num_tokens": 33585649.0, + "reward": 3.941239595413208, + "reward_std": 0.2312232404947281, + "rewards/reward_fn/mean": 3.941239595413208, + "rewards/reward_fn/std": 0.2312232255935669, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 80.5, + "completions/mean_terminated_length": 80.5, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.1667629056473034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.01302891704835929, + "learning_rate": 5.114e-06, + "loss": 0.0005, + "num_tokens": 33603905.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 337.1875, + "completions/mean_terminated_length": 337.1875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.16687839242406743, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.009241328960342798, + "learning_rate": 5.1119999999999995e-06, + "loss": 0.0004, + "num_tokens": 33626727.0, + "reward": 2.8979334831237793, + "reward_std": 0.3262321650981903, + "rewards/reward_fn/mean": 2.8979334831237793, + "rewards/reward_fn/std": 0.3262321650981903, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 99.53125, + "completions/mean_terminated_length": 99.53125, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.1669938792008315, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "kl": 0.010674518845917191, + "learning_rate": 5.11e-06, + "loss": 0.0004, + "num_tokens": 33652152.0, + "reward": 3.9397382736206055, + "reward_std": 0.23751422762870789, + "rewards/reward_fn/mean": 3.9397382736206055, + "rewards/reward_fn/std": 0.2375142127275467, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 333.40625, + "completions/mean_terminated_length": 333.40625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.16710936597759557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049560546875, + "kl": 0.011559099177247845, + "learning_rate": 5.107999999999999e-06, + "loss": 0.0005, + "num_tokens": 33677829.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 218.875, + "completions/mean_terminated_length": 218.875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.16722485275435964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.011533924196555745, + "learning_rate": 5.106e-06, + "loss": 0.0005, + "num_tokens": 33699809.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 318.15625, + "completions/mean_terminated_length": 318.15625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.16734033953112368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.01171531768341083, + "learning_rate": 5.104e-06, + "loss": 0.0005, + "num_tokens": 33722374.0, + "reward": 3.8988218307495117, + "reward_std": 0.4229646921157837, + "rewards/reward_fn/mean": 3.8988218307495117, + "rewards/reward_fn/std": 0.4229647219181061, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 165.625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.16745582630788775, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.011317903627059422, + "learning_rate": 5.1020000000000004e-06, + "loss": 0.0005, + "num_tokens": 33740890.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 393.5, + "completions/mean_terminated_length": 393.5, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.1675713130846518, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.010758909527794458, + "learning_rate": 5.0999999999999995e-06, + "loss": 0.0004, + "num_tokens": 33766506.0, + "reward": 3.4616949558258057, + "reward_std": 0.5520514845848083, + "rewards/reward_fn/mean": 3.4616949558258057, + "rewards/reward_fn/std": 0.5520515441894531, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 90.09375, + "completions/mean_terminated_length": 90.09375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.16768679986141588, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.010228288891084958, + "learning_rate": 5.097999999999999e-06, + "loss": 0.0004, + "num_tokens": 33781069.0, + "reward": 3.932429790496826, + "reward_std": 0.38223469257354736, + "rewards/reward_fn/mean": 3.932429790496826, + "rewards/reward_fn/std": 0.382234662771225, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 75.4375, + "completions/mean_terminated_length": 75.4375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.16780228663817992, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.40625, + "kl": 0.018721423592069186, + "learning_rate": 5.096e-06, + "loss": 0.0007, + "num_tokens": 33796123.0, + "reward": 3.9445204734802246, + "reward_std": 0.21969404816627502, + "rewards/reward_fn/mean": 3.9445204734802246, + "rewards/reward_fn/std": 0.21969406306743622, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 135.0, + "completions/mean_terminated_length": 135.0, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.167917773414944, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "kl": 0.012551300627819728, + "learning_rate": 5.094e-06, + "loss": 0.0005, + "num_tokens": 33819547.0, + "reward": 3.9016828536987305, + "reward_std": 0.42882928252220154, + "rewards/reward_fn/mean": 3.9016828536987305, + "rewards/reward_fn/std": 0.42882928252220154, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 278.5, + "completions/mean_terminated_length": 278.5, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.16803326019170806, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.0077072979547665454, + "learning_rate": 5.092e-06, + "loss": 0.0003, + "num_tokens": 33862923.0, + "reward": 3.647500514984131, + "reward_std": 0.46479907631874084, + "rewards/reward_fn/mean": 3.647500514984131, + "rewards/reward_fn/std": 0.46479904651641846, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 65.25, + "completions/mean_terminated_length": 65.25, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.16814874696847212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.171875, + "kl": 0.013544193374400493, + "learning_rate": 5.0899999999999995e-06, + "loss": 0.0005, + "num_tokens": 33888339.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 163.78125, + "completions/mean_terminated_length": 163.78125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.16826423374523616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.011313504641293548, + "learning_rate": 5.088e-06, + "loss": 0.0005, + "num_tokens": 33917260.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 296.84375, + "completions/mean_terminated_length": 296.84375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.16837972052200023, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.014389701682375744, + "learning_rate": 5.086e-06, + "loss": 0.0006, + "num_tokens": 33939335.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 166.0625, + "completions/mean_terminated_length": 166.0625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.1684952072987643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.015568592731142417, + "learning_rate": 5.083999999999999e-06, + "loss": 0.0006, + "num_tokens": 33965385.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 106.65625, + "completions/mean_terminated_length": 106.65625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.16861069407552834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.010060117419925518, + "learning_rate": 5.082e-06, + "loss": 0.0004, + "num_tokens": 33981182.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 250.21875, + "completions/mean_terminated_length": 250.21875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.1687261808522924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.01591378681769129, + "learning_rate": 5.08e-06, + "loss": 0.0006, + "num_tokens": 34002501.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 493.96875, + "completions/mean_terminated_length": 390.36669921875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.16884166762905647, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.013918919175921474, + "learning_rate": 5.078e-06, + "loss": 0.0006, + "num_tokens": 34031108.0, + "reward": 3.313534736633301, + "reward_std": 1.1439014673233032, + "rewards/reward_fn/mean": 3.313534736633301, + "rewards/reward_fn/std": 1.1439014673233032, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 176.0, + "completions/mean_terminated_length": 176.0, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.16895715440582054, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.011851076749735512, + "learning_rate": 5.075999999999999e-06, + "loss": 0.0005, + "num_tokens": 34055716.0, + "reward": 3.244927406311035, + "reward_std": 0.025491951033473015, + "rewards/reward_fn/mean": 3.244927406311035, + "rewards/reward_fn/std": 0.02549191750586033, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 137.3125, + "completions/mean_terminated_length": 137.3125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.16907264118258458, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.01005933581473073, + "learning_rate": 5.073999999999999e-06, + "loss": 0.0004, + "num_tokens": 34080750.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 348.28125, + "completions/mean_terminated_length": 348.28125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.16918812795934865, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.011598434095503762, + "learning_rate": 5.072e-06, + "loss": 0.0005, + "num_tokens": 34114903.0, + "reward": 3.5389928817749023, + "reward_std": 0.8710470199584961, + "rewards/reward_fn/mean": 3.5389928817749023, + "rewards/reward_fn/std": 0.8710470199584961, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 252.34375, + "completions/mean_terminated_length": 252.34375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.16930361473611272, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.02172233311284799, + "learning_rate": 5.07e-06, + "loss": 0.0009, + "num_tokens": 34143490.0, + "reward": 3.4884095191955566, + "reward_std": 0.489868700504303, + "rewards/reward_fn/mean": 3.4884095191955566, + "rewards/reward_fn/std": 0.48986876010894775, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 175.90625, + "completions/mean_terminated_length": 175.90625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.16941910151287679, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.015610597067279741, + "learning_rate": 5.068e-06, + "loss": 0.0006, + "num_tokens": 34174207.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 70.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 53.34375, + "completions/mean_terminated_length": 53.34375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.16953458828964083, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.46875, + "kl": 0.018884696168242954, + "learning_rate": 5.0659999999999994e-06, + "loss": 0.0008, + "num_tokens": 34191818.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 188.375, + "completions/mean_terminated_length": 188.375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.1696500750664049, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.013608283901703544, + "learning_rate": 5.064e-06, + "loss": 0.0005, + "num_tokens": 34222902.0, + "reward": 3.758951187133789, + "reward_std": 0.4268967807292938, + "rewards/reward_fn/mean": 3.758951187133789, + "rewards/reward_fn/std": 0.42689675092697144, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 113.3125, + "completions/mean_terminated_length": 113.3125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.16976556184316896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04248046875, + "kl": 0.007413507863020641, + "learning_rate": 5.062e-06, + "loss": 0.0003, + "num_tokens": 34237664.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 146.78125, + "completions/mean_terminated_length": 146.78125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.16988104861993303, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.009537438927509356, + "learning_rate": 5.059999999999999e-06, + "loss": 0.0004, + "num_tokens": 34262809.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 101.40625, + "completions/mean_terminated_length": 101.40625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.16999653539669707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.012886630669527221, + "learning_rate": 5.058e-06, + "loss": 0.0005, + "num_tokens": 34284102.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 105.96875, + "completions/mean_terminated_length": 105.96875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.17011202217346114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.01117290006368421, + "learning_rate": 5.0559999999999995e-06, + "loss": 0.0004, + "num_tokens": 34304293.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 84.0, + "completions/mean_length": 56.625, + "completions/mean_terminated_length": 56.625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.1702275089502252, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.008192737102945102, + "learning_rate": 5.054e-06, + "loss": 0.0003, + "num_tokens": 34314169.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 205.71875, + "completions/mean_terminated_length": 205.71875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.17034299572698927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.011116249777842313, + "learning_rate": 5.051999999999999e-06, + "loss": 0.0004, + "num_tokens": 34333360.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 208.5625, + "completions/mean_terminated_length": 208.5625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.1704584825037533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.014143482847430278, + "learning_rate": 5.05e-06, + "loss": 0.0006, + "num_tokens": 34359906.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 175.9375, + "completions/mean_terminated_length": 175.9375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.17057396928051738, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.01715507921471726, + "learning_rate": 5.048e-06, + "loss": 0.0007, + "num_tokens": 34376736.0, + "reward": 3.554102897644043, + "reward_std": 0.42781955003738403, + "rewards/reward_fn/mean": 3.554102897644043, + "rewards/reward_fn/std": 0.42781952023506165, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 213.4375, + "completions/mean_terminated_length": 213.4375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.17068945605728145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.012048730728565715, + "learning_rate": 5.046e-06, + "loss": 0.0005, + "num_tokens": 34395854.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 88.0, + "completions/max_terminated_length": 88.0, + "completions/mean_length": 70.03125, + "completions/mean_terminated_length": 70.03125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.17080494283404551, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.009218921479259734, + "learning_rate": 5.0439999999999995e-06, + "loss": 0.0004, + "num_tokens": 34410095.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 240.84375, + "completions/mean_terminated_length": 240.84375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.17092042961080955, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.014718743899720721, + "learning_rate": 5.041999999999999e-06, + "loss": 0.0006, + "num_tokens": 34437802.0, + "reward": 3.9287750720977783, + "reward_std": 0.20115776360034943, + "rewards/reward_fn/mean": 3.9287750720977783, + "rewards/reward_fn/std": 0.2011578232049942, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 226.84375, + "completions/mean_terminated_length": 226.84375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.17103591638757362, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.008682812316692434, + "learning_rate": 5.04e-06, + "loss": 0.0003, + "num_tokens": 34463845.0, + "reward": 3.926265239715576, + "reward_std": 0.4171067476272583, + "rewards/reward_fn/mean": 3.926265239715576, + "rewards/reward_fn/std": 0.4171067476272583, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 112.84375, + "completions/mean_terminated_length": 112.84375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.1711514031643377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.012129517257562838, + "learning_rate": 5.038e-06, + "loss": 0.0005, + "num_tokens": 34486400.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 149.28125, + "completions/mean_terminated_length": 149.28125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.17126688994110176, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.016433918550319504, + "learning_rate": 5.036e-06, + "loss": 0.0007, + "num_tokens": 34505129.0, + "reward": 3.8098652362823486, + "reward_std": 0.36557501554489136, + "rewards/reward_fn/mean": 3.8098652362823486, + "rewards/reward_fn/std": 0.36557504534721375, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 136.53125, + "completions/mean_terminated_length": 136.53125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.1713823767178658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.010500548232812434, + "learning_rate": 5.0339999999999996e-06, + "loss": 0.0004, + "num_tokens": 34521402.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 240.0625, + "completions/mean_terminated_length": 240.0625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.17149786349462987, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.01791628816863522, + "learning_rate": 5.0319999999999994e-06, + "loss": 0.0007, + "num_tokens": 34552412.0, + "reward": 3.9782166481018066, + "reward_std": 0.12322542071342468, + "rewards/reward_fn/mean": 3.9782166481018066, + "rewards/reward_fn/std": 0.12322545051574707, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 191.84375, + "completions/mean_terminated_length": 191.84375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.17161335027139393, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.013314351759618148, + "learning_rate": 5.03e-06, + "loss": 0.0005, + "num_tokens": 34575607.0, + "reward": 3.9552812576293945, + "reward_std": 0.18035712838172913, + "rewards/reward_fn/mean": 3.9552812576293945, + "rewards/reward_fn/std": 0.18035712838172913, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 195.125, + "completions/mean_terminated_length": 195.125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.17172883704815797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.009394441483891569, + "learning_rate": 5.027999999999999e-06, + "loss": 0.0004, + "num_tokens": 34594203.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 386.3125, + "completions/mean_terminated_length": 386.3125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.17184432382492204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.400390625, + "kl": 0.017371140595059842, + "learning_rate": 5.026e-06, + "loss": 0.0007, + "num_tokens": 34621957.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 113.46875, + "completions/mean_terminated_length": 113.46875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.1719598106016861, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.006953953045012895, + "learning_rate": 5.024e-06, + "loss": 0.0003, + "num_tokens": 34652212.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 281.4375, + "completions/mean_terminated_length": 281.4375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.17207529737845018, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.012528109480626881, + "learning_rate": 5.022e-06, + "loss": 0.0005, + "num_tokens": 34676514.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 188.75, + "completions/mean_terminated_length": 188.75, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.17219078415521422, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.009590132867742795, + "learning_rate": 5.019999999999999e-06, + "loss": 0.0004, + "num_tokens": 34702042.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 174.0, + "completions/mean_terminated_length": 174.0, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.17230627093197828, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.34375, + "kl": 0.014434514116146602, + "learning_rate": 5.017999999999999e-06, + "loss": 0.0006, + "num_tokens": 34731674.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 104.0, + "completions/max_terminated_length": 104.0, + "completions/mean_length": 81.3125, + "completions/mean_terminated_length": 81.3125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.17242175770874235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.169921875, + "kl": 0.019872604694683105, + "learning_rate": 5.016e-06, + "loss": 0.0008, + "num_tokens": 34759236.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 273.75, + "completions/mean_terminated_length": 273.75, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.17253724448550642, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.01272648801386822, + "learning_rate": 5.014e-06, + "loss": 0.0005, + "num_tokens": 34790140.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1442.0, + "completions/max_terminated_length": 1442.0, + "completions/mean_length": 169.03125, + "completions/mean_terminated_length": 169.03125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.17265273126227046, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8125, + "kl": 0.009113792402786203, + "learning_rate": 5.012e-06, + "loss": 0.0004, + "num_tokens": 34806909.0, + "reward": 3.928697347640991, + "reward_std": 0.4033488631248474, + "rewards/reward_fn/mean": 3.928697347640991, + "rewards/reward_fn/std": 0.403348833322525, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 477.4375, + "completions/mean_terminated_length": 477.4375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.17276821803903453, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.009775388745765667, + "learning_rate": 5.0099999999999995e-06, + "loss": 0.0004, + "num_tokens": 34844875.0, + "reward": 3.0571725368499756, + "reward_std": 0.6702634692192078, + "rewards/reward_fn/mean": 3.0571725368499756, + "rewards/reward_fn/std": 0.6702633500099182, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 286.25, + "completions/mean_terminated_length": 286.25, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.1728837048157986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.014437593432376161, + "learning_rate": 5.008e-06, + "loss": 0.0006, + "num_tokens": 34867059.0, + "reward": 3.642606735229492, + "reward_std": 0.8438201546669006, + "rewards/reward_fn/mean": 3.642606735229492, + "rewards/reward_fn/std": 0.8438201546669006, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 188.15625, + "completions/mean_terminated_length": 188.15625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.17299919159256266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.01205385031789774, + "learning_rate": 5.006e-06, + "loss": 0.0005, + "num_tokens": 34885432.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 214.90625, + "completions/mean_terminated_length": 214.90625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.1731146783693267, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.011310947957099415, + "learning_rate": 5.003999999999999e-06, + "loss": 0.0005, + "num_tokens": 34905333.0, + "reward": 3.934922218322754, + "reward_std": 0.25609108805656433, + "rewards/reward_fn/mean": 3.934922218322754, + "rewards/reward_fn/std": 0.25609102845191956, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 81.5625, + "completions/mean_terminated_length": 81.5625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.17323016514609077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.011319865050609224, + "learning_rate": 5.002e-06, + "loss": 0.0005, + "num_tokens": 34927239.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 175.8125, + "completions/mean_terminated_length": 175.8125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.17334565192285484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.018809136498020962, + "learning_rate": 4.9999999999999996e-06, + "loss": 0.0008, + "num_tokens": 34953217.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 97.28125, + "completions/mean_terminated_length": 97.28125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.1734611386996189, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.00951177571550943, + "learning_rate": 4.998e-06, + "loss": 0.0004, + "num_tokens": 34969546.0, + "reward": 3.9297335147857666, + "reward_std": 0.3974871337413788, + "rewards/reward_fn/mean": 3.9297335147857666, + "rewards/reward_fn/std": 0.397487074136734, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 153.59375, + "completions/mean_terminated_length": 153.59375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.17357662547638295, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.012473574242903851, + "learning_rate": 4.996e-06, + "loss": 0.0005, + "num_tokens": 34987165.0, + "reward": 3.7352747917175293, + "reward_std": 0.39962586760520935, + "rewards/reward_fn/mean": 3.7352747917175293, + "rewards/reward_fn/std": 0.3996259272098541, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 54.0625, + "completions/mean_terminated_length": 54.0625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.173692112253147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.006560814301337814, + "learning_rate": 4.994e-06, + "loss": 0.0003, + "num_tokens": 35004447.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 98.625, + "completions/mean_terminated_length": 98.625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.17380759902991108, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.009251270450477023, + "learning_rate": 4.992e-06, + "loss": 0.0004, + "num_tokens": 35030515.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 86.0, + "completions/max_terminated_length": 86.0, + "completions/mean_length": 63.875, + "completions/mean_terminated_length": 63.875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.17392308580667515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.010481028752110433, + "learning_rate": 4.99e-06, + "loss": 0.0004, + "num_tokens": 35054127.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 107.375, + "completions/mean_terminated_length": 107.375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.1740385725834392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.015241847591823898, + "learning_rate": 4.988e-06, + "loss": 0.0006, + "num_tokens": 35068667.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 248.78125, + "completions/mean_terminated_length": 248.78125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.17415405936020326, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.012516976188635454, + "learning_rate": 4.985999999999999e-06, + "loss": 0.0005, + "num_tokens": 35089396.0, + "reward": 3.5819058418273926, + "reward_std": 0.8842886090278625, + "rewards/reward_fn/mean": 3.5819058418273926, + "rewards/reward_fn/std": 0.8842885494232178, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 170.9375, + "completions/mean_terminated_length": 170.9375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.17426954613696732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051513671875, + "kl": 0.0076431850757217035, + "learning_rate": 4.984e-06, + "loss": 0.0003, + "num_tokens": 35121362.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 88.71875, + "completions/mean_terminated_length": 88.71875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.1743850329137314, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.953125, + "kl": 0.018334413369302638, + "learning_rate": 4.982e-06, + "loss": 0.0007, + "num_tokens": 35143081.0, + "reward": 3.348668098449707, + "reward_std": 0.04927581176161766, + "rewards/reward_fn/mean": 3.348668098449707, + "rewards/reward_fn/std": 0.04927579313516617, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 63.375, + "completions/mean_terminated_length": 63.375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.17450051969049543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.007731871122814482, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0003, + "num_tokens": 35165877.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 125.84375, + "completions/mean_terminated_length": 125.84375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.1746160064672595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.010436229866172653, + "learning_rate": 4.978e-06, + "loss": 0.0004, + "num_tokens": 35181360.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 138.90625, + "completions/mean_terminated_length": 138.90625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.17473149324402357, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05517578125, + "kl": 0.009471396027947776, + "learning_rate": 4.9759999999999995e-06, + "loss": 0.0004, + "num_tokens": 35197869.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 157.28125, + "completions/mean_terminated_length": 157.28125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.1748469800207876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061767578125, + "kl": 0.012078763553290628, + "learning_rate": 4.974e-06, + "loss": 0.0005, + "num_tokens": 35212918.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 251.03125, + "completions/mean_terminated_length": 251.03125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.17496246679755167, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.011156738837598823, + "learning_rate": 4.972e-06, + "loss": 0.0004, + "num_tokens": 35244247.0, + "reward": 3.843135356903076, + "reward_std": 0.4698220193386078, + "rewards/reward_fn/mean": 3.843135356903076, + "rewards/reward_fn/std": 0.4698220491409302, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 78.21875, + "completions/mean_terminated_length": 78.21875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.17507795357431574, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.01402955524099525, + "learning_rate": 4.97e-06, + "loss": 0.0006, + "num_tokens": 35266398.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 130.03125, + "completions/mean_terminated_length": 130.03125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.1751934403510798, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.01314035446557682, + "learning_rate": 4.968e-06, + "loss": 0.0005, + "num_tokens": 35287711.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 182.5625, + "completions/mean_terminated_length": 182.5625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.17530892712784385, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.01104440851486288, + "learning_rate": 4.9659999999999995e-06, + "loss": 0.0004, + "num_tokens": 35315633.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 170.65625, + "completions/mean_terminated_length": 170.65625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.17542441390460792, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.023638754821149632, + "learning_rate": 4.964e-06, + "loss": 0.0009, + "num_tokens": 35342342.0, + "reward": 3.6532225608825684, + "reward_std": 0.3572375476360321, + "rewards/reward_fn/mean": 3.6532225608825684, + "rewards/reward_fn/std": 0.3572375774383545, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 66.34375, + "completions/mean_terminated_length": 66.34375, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.17553990068137199, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.009092947089811787, + "learning_rate": 4.961999999999999e-06, + "loss": 0.0004, + "num_tokens": 35357169.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 66.125, + "completions/mean_terminated_length": 66.125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.17565538745813605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.007508410292757617, + "learning_rate": 4.96e-06, + "loss": 0.0003, + "num_tokens": 35381685.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 75.5625, + "completions/mean_terminated_length": 75.5625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.1757708742349001, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049560546875, + "kl": 0.004190903790004086, + "learning_rate": 4.958e-06, + "loss": 0.0002, + "num_tokens": 35392359.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 373.84375, + "completions/mean_terminated_length": 373.84375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.17588636101166416, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.01046387536189286, + "learning_rate": 4.9560000000000005e-06, + "loss": 0.0004, + "num_tokens": 35420258.0, + "reward": 3.8906521797180176, + "reward_std": 0.4515727162361145, + "rewards/reward_fn/mean": 3.8906521797180176, + "rewards/reward_fn/std": 0.4515726566314697, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 242.9375, + "completions/mean_terminated_length": 242.9375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.17600184778842823, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.020398217733600177, + "learning_rate": 4.9539999999999995e-06, + "loss": 0.0008, + "num_tokens": 35450144.0, + "reward": 3.3888912200927734, + "reward_std": 0.6731318831443787, + "rewards/reward_fn/mean": 3.3888912200927734, + "rewards/reward_fn/std": 0.6731318831443787, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 304.65625, + "completions/mean_terminated_length": 304.65625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.1761173345651923, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.015455970278708264, + "learning_rate": 4.951999999999999e-06, + "loss": 0.0006, + "num_tokens": 35475349.0, + "reward": 3.929544448852539, + "reward_std": 0.39855676889419556, + "rewards/reward_fn/mean": 3.929544448852539, + "rewards/reward_fn/std": 0.39855676889419556, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 249.28125, + "completions/mean_terminated_length": 249.28125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.17623282134195634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.012518298113718629, + "learning_rate": 4.95e-06, + "loss": 0.0005, + "num_tokens": 35494814.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 259.75, + "completions/mean_terminated_length": 259.75, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.1763483081187204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.0077184620531625114, + "learning_rate": 4.948e-06, + "loss": 0.0003, + "num_tokens": 35527446.0, + "reward": 3.730759620666504, + "reward_std": 0.38243016600608826, + "rewards/reward_fn/mean": 3.730759620666504, + "rewards/reward_fn/std": 0.38243016600608826, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 85.0625, + "completions/mean_terminated_length": 85.0625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.17646379489548447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.009969974598789122, + "learning_rate": 4.946e-06, + "loss": 0.0004, + "num_tokens": 35543704.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 72.9375, + "completions/mean_terminated_length": 72.9375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.17657928167224854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.005499422451975988, + "learning_rate": 4.944e-06, + "loss": 0.0002, + "num_tokens": 35571350.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 192.3125, + "completions/mean_terminated_length": 192.3125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.17669476844901258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052001953125, + "kl": 0.008400430670008063, + "learning_rate": 4.942e-06, + "loss": 0.0003, + "num_tokens": 35608800.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 202.5625, + "completions/mean_terminated_length": 202.5625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.17681025522577665, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.014897677261615172, + "learning_rate": 4.94e-06, + "loss": 0.0006, + "num_tokens": 35626258.0, + "reward": 3.9298808574676514, + "reward_std": 0.396653413772583, + "rewards/reward_fn/mean": 3.9298808574676514, + "rewards/reward_fn/std": 0.3966533839702606, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 99.53125, + "completions/mean_terminated_length": 99.53125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.17692574200254071, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.008095404191408306, + "learning_rate": 4.937999999999999e-06, + "loss": 0.0003, + "num_tokens": 35654275.0, + "reward": 3.643497943878174, + "reward_std": 0.4683251976966858, + "rewards/reward_fn/mean": 3.643497943878174, + "rewards/reward_fn/std": 0.4683252274990082, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 239.4375, + "completions/mean_terminated_length": 239.4375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.17704122877930478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10107421875, + "kl": 0.015264915564330295, + "learning_rate": 4.936e-06, + "loss": 0.0006, + "num_tokens": 35674481.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1063.0, + "completions/max_terminated_length": 1063.0, + "completions/mean_length": 375.71875, + "completions/mean_terminated_length": 375.71875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.17715671555606882, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.012625970237422734, + "learning_rate": 4.934e-06, + "loss": 0.0005, + "num_tokens": 35711368.0, + "reward": 3.349716901779175, + "reward_std": 0.8274109959602356, + "rewards/reward_fn/mean": 3.349716901779175, + "rewards/reward_fn/std": 0.8274109959602356, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 291.71875, + "completions/mean_terminated_length": 291.71875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.1772722023328329, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.011283890475169756, + "learning_rate": 4.932e-06, + "loss": 0.0005, + "num_tokens": 35743679.0, + "reward": 3.966594696044922, + "reward_std": 0.18896877765655518, + "rewards/reward_fn/mean": 3.966594696044922, + "rewards/reward_fn/std": 0.18896876275539398, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 189.34375, + "completions/mean_terminated_length": 189.34375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.17738768910959696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.013379087919020094, + "learning_rate": 4.929999999999999e-06, + "loss": 0.0005, + "num_tokens": 35761770.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 151.4375, + "completions/mean_terminated_length": 151.4375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.17750317588636103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.010632951220031828, + "learning_rate": 4.928e-06, + "loss": 0.0004, + "num_tokens": 35788920.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 131.96875, + "completions/mean_terminated_length": 131.96875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.17761866266312507, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12109375, + "kl": 0.01614202428027056, + "learning_rate": 4.926e-06, + "loss": 0.0006, + "num_tokens": 35805271.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 501.59375, + "completions/mean_terminated_length": 451.70965576171875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.17773414943988913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.014262836542911828, + "learning_rate": 4.924e-06, + "loss": 0.0006, + "num_tokens": 35833194.0, + "reward": 2.633406639099121, + "reward_std": 1.2585886716842651, + "rewards/reward_fn/mean": 2.633406639099121, + "rewards/reward_fn/std": 1.2585886716842651, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 69.71875, + "completions/mean_terminated_length": 69.71875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.1778496362166532, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.011625735445704777, + "learning_rate": 4.922e-06, + "loss": 0.0005, + "num_tokens": 35850401.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 203.25, + "completions/mean_terminated_length": 203.25, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.17796512299341724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04638671875, + "kl": 0.008498681709170341, + "learning_rate": 4.9199999999999995e-06, + "loss": 0.0003, + "num_tokens": 35876169.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 82.4375, + "completions/mean_terminated_length": 82.4375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.1780806097701813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.008779815791058354, + "learning_rate": 4.918e-06, + "loss": 0.0004, + "num_tokens": 35889303.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 63.5625, + "completions/mean_terminated_length": 63.5625, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.17819609654694538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.0055325915818684734, + "learning_rate": 4.916e-06, + "loss": 0.0002, + "num_tokens": 35902953.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 142.9375, + "completions/mean_terminated_length": 142.9375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.17831158332370944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.011405782723159064, + "learning_rate": 4.914e-06, + "loss": 0.0005, + "num_tokens": 35925607.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 157.875, + "completions/mean_terminated_length": 157.875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.17842707010047348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.011148897450766526, + "learning_rate": 4.912e-06, + "loss": 0.0004, + "num_tokens": 35943363.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 75.4375, + "completions/mean_terminated_length": 75.4375, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.17854255687723755, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.484375, + "kl": 0.020137341940426268, + "learning_rate": 4.91e-06, + "loss": 0.0008, + "num_tokens": 35962545.0, + "reward": 3.40544056892395, + "reward_std": 0.02328232303261757, + "rewards/reward_fn/mean": 3.40544056892395, + "rewards/reward_fn/std": 0.02328234352171421, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 130.28125, + "completions/mean_terminated_length": 130.28125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.17865804365400162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.00868732135859318, + "learning_rate": 4.908e-06, + "loss": 0.0003, + "num_tokens": 35979290.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 121.78125, + "completions/mean_terminated_length": 121.78125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.1787735304307657, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.014709172421135008, + "learning_rate": 4.905999999999999e-06, + "loss": 0.0006, + "num_tokens": 36003315.0, + "reward": 3.9819436073303223, + "reward_std": 0.10214319825172424, + "rewards/reward_fn/mean": 3.9819436073303223, + "rewards/reward_fn/std": 0.10214322060346603, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 145.8125, + "completions/mean_terminated_length": 145.8125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.17888901720752973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.011108118800621014, + "learning_rate": 4.904e-06, + "loss": 0.0004, + "num_tokens": 36020269.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 275.78125, + "completions/mean_terminated_length": 275.78125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.1790045039842938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.009707677134429105, + "learning_rate": 4.902e-06, + "loss": 0.0004, + "num_tokens": 36044358.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 423.84375, + "completions/mean_terminated_length": 423.84375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.17911999076105786, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.011164276977069676, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0004, + "num_tokens": 36069825.0, + "reward": 3.7207741737365723, + "reward_std": 0.7505890727043152, + "rewards/reward_fn/mean": 3.7207741737365723, + "rewards/reward_fn/std": 0.75058913230896, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 98.15625, + "completions/mean_terminated_length": 98.15625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.17923547753782193, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1767578125, + "kl": 0.026202144974377006, + "learning_rate": 4.8979999999999995e-06, + "loss": 0.001, + "num_tokens": 36084166.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 393.625, + "completions/mean_terminated_length": 393.625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.17935096431458597, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.011570349903195165, + "learning_rate": 4.895999999999999e-06, + "loss": 0.0005, + "num_tokens": 36119610.0, + "reward": 3.2726781368255615, + "reward_std": 0.5004549026489258, + "rewards/reward_fn/mean": 3.2726781368255615, + "rewards/reward_fn/std": 0.5004549026489258, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 104.28125, + "completions/mean_terminated_length": 104.28125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.17946645109135004, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.00725434509513434, + "learning_rate": 4.894e-06, + "loss": 0.0003, + "num_tokens": 36148419.0, + "reward": 3.971043348312378, + "reward_std": 0.16380394995212555, + "rewards/reward_fn/mean": 3.971043348312378, + "rewards/reward_fn/std": 0.16380397975444794, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 84.8125, + "completions/mean_terminated_length": 84.8125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.1795819378681141, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.009045204213180114, + "learning_rate": 4.892e-06, + "loss": 0.0004, + "num_tokens": 36164989.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 101.28125, + "completions/mean_terminated_length": 101.28125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.17969742464487817, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.006562016933457926, + "learning_rate": 4.89e-06, + "loss": 0.0003, + "num_tokens": 36180646.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 315.8125, + "completions/mean_terminated_length": 315.8125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.1798129114216422, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.012211218985612504, + "learning_rate": 4.888e-06, + "loss": 0.0005, + "num_tokens": 36209920.0, + "reward": 3.377922534942627, + "reward_std": 0.49712690711021423, + "rewards/reward_fn/mean": 3.377922534942627, + "rewards/reward_fn/std": 0.497126966714859, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 333.59375, + "completions/mean_terminated_length": 333.59375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.17992839819840628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.010556107954471372, + "learning_rate": 4.886e-06, + "loss": 0.0004, + "num_tokens": 36236691.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 92.40625, + "completions/mean_terminated_length": 92.40625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.18004388497517035, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "kl": 0.015312945048208348, + "learning_rate": 4.884e-06, + "loss": 0.0006, + "num_tokens": 36260512.0, + "reward": 3.6679821014404297, + "reward_std": 0.08620647341012955, + "rewards/reward_fn/mean": 3.6679821014404297, + "rewards/reward_fn/std": 0.08620647341012955, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 225.6875, + "completions/mean_terminated_length": 225.6875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.18015937175193442, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.009494007492321543, + "learning_rate": 4.881999999999999e-06, + "loss": 0.0004, + "num_tokens": 36289078.0, + "reward": 3.946481704711914, + "reward_std": 0.17710645496845245, + "rewards/reward_fn/mean": 3.946481704711914, + "rewards/reward_fn/std": 0.17710649967193604, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 406.5, + "completions/mean_terminated_length": 406.5, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.18027485852869846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9765625, + "kl": 0.008590235534938984, + "learning_rate": 4.88e-06, + "loss": 0.0003, + "num_tokens": 36318182.0, + "reward": 3.862274646759033, + "reward_std": 0.5419435501098633, + "rewards/reward_fn/mean": 3.862274646759033, + "rewards/reward_fn/std": 0.5419435501098633, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 119.875, + "completions/mean_terminated_length": 119.875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.18039034530546252, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.012044955321471207, + "learning_rate": 4.878e-06, + "loss": 0.0005, + "num_tokens": 36343778.0, + "reward": 3.9671072959899902, + "reward_std": 0.186069518327713, + "rewards/reward_fn/mean": 3.9671072959899902, + "rewards/reward_fn/std": 0.186069518327713, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 95.21875, + "completions/mean_terminated_length": 95.21875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.1805058320822266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.009421510912943631, + "learning_rate": 4.876e-06, + "loss": 0.0004, + "num_tokens": 36358153.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 267.34375, + "completions/mean_terminated_length": 267.34375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.18062131885899066, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.020551851936033927, + "learning_rate": 4.873999999999999e-06, + "loss": 0.0008, + "num_tokens": 36379572.0, + "reward": 3.928919553756714, + "reward_std": 0.4020920395851135, + "rewards/reward_fn/mean": 3.928919553756714, + "rewards/reward_fn/std": 0.40209200978279114, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 261.375, + "completions/mean_terminated_length": 261.375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.1807368056357547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.009396132612891961, + "learning_rate": 4.871999999999999e-06, + "loss": 0.0004, + "num_tokens": 36403136.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 290.59375, + "completions/mean_terminated_length": 290.59375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.18085229241251877, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.010716858654632233, + "learning_rate": 4.87e-06, + "loss": 0.0004, + "num_tokens": 36428819.0, + "reward": 3.63002347946167, + "reward_std": 0.6396785974502563, + "rewards/reward_fn/mean": 3.63002347946167, + "rewards/reward_fn/std": 0.6396785378456116, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 188.1875, + "completions/mean_terminated_length": 188.1875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.18096777918928283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.009881015284918249, + "learning_rate": 4.868e-06, + "loss": 0.0004, + "num_tokens": 36449017.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 99.5, + "completions/mean_terminated_length": 99.5, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.18108326596604687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.016065262083429843, + "learning_rate": 4.866e-06, + "loss": 0.0006, + "num_tokens": 36477545.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 84.9375, + "completions/mean_terminated_length": 84.9375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.18119875274281094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.008718678258446744, + "learning_rate": 4.8639999999999995e-06, + "loss": 0.0003, + "num_tokens": 36502343.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 189.5625, + "completions/mean_terminated_length": 189.5625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.181314239519575, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.008547243349312339, + "learning_rate": 4.862e-06, + "loss": 0.0003, + "num_tokens": 36527769.0, + "reward": 3.9386284351348877, + "reward_std": 0.24242247641086578, + "rewards/reward_fn/mean": 3.9386284351348877, + "rewards/reward_fn/std": 0.2424224615097046, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 119.84375, + "completions/mean_terminated_length": 119.84375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.18142972629633908, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.015031763039587531, + "learning_rate": 4.86e-06, + "loss": 0.0006, + "num_tokens": 36548628.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 233.71875, + "completions/mean_terminated_length": 233.71875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.18154521307310312, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.010615851482725702, + "learning_rate": 4.857999999999999e-06, + "loss": 0.0004, + "num_tokens": 36578251.0, + "reward": 3.3306121826171875, + "reward_std": 0.41098928451538086, + "rewards/reward_fn/mean": 3.3306121826171875, + "rewards/reward_fn/std": 0.41098925471305847, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 255.65625, + "completions/mean_terminated_length": 255.65625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.18166069984986719, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.008647599919640925, + "learning_rate": 4.856e-06, + "loss": 0.0003, + "num_tokens": 36610848.0, + "reward": 2.6769402027130127, + "reward_std": 0.05293108895421028, + "rewards/reward_fn/mean": 2.6769402027130127, + "rewards/reward_fn/std": 0.05293111130595207, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 98.78125, + "completions/mean_terminated_length": 98.78125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.18177618662663125, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.013129374819982331, + "learning_rate": 4.854e-06, + "loss": 0.0005, + "num_tokens": 36637401.0, + "reward": 3.902198553085327, + "reward_std": 0.4134228229522705, + "rewards/reward_fn/mean": 3.902198553085327, + "rewards/reward_fn/std": 0.4134228527545929, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 235.78125, + "completions/mean_terminated_length": 235.78125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.18189167340339532, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.011915650640730746, + "learning_rate": 4.852e-06, + "loss": 0.0005, + "num_tokens": 36653138.0, + "reward": 2.763887405395508, + "reward_std": 0.19655361771583557, + "rewards/reward_fn/mean": 2.763887405395508, + "rewards/reward_fn/std": 0.19655358791351318, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 300.34375, + "completions/mean_terminated_length": 300.34375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.18200716018015936, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.008270016383903567, + "learning_rate": 4.849999999999999e-06, + "loss": 0.0003, + "num_tokens": 36687389.0, + "reward": 3.9394192695617676, + "reward_std": 0.2399715930223465, + "rewards/reward_fn/mean": 3.9394192695617676, + "rewards/reward_fn/std": 0.2399715632200241, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 303.96875, + "completions/mean_terminated_length": 303.96875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.18212264695692343, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.013060642886557616, + "learning_rate": 4.848e-06, + "loss": 0.0005, + "num_tokens": 36715612.0, + "reward": 3.0484678745269775, + "reward_std": 0.6924504637718201, + "rewards/reward_fn/mean": 3.0484678745269775, + "rewards/reward_fn/std": 0.6924504637718201, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 232.0625, + "completions/mean_terminated_length": 232.0625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.1822381337336875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.0077682615592493676, + "learning_rate": 4.846e-06, + "loss": 0.0003, + "num_tokens": 36737502.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 118.75, + "completions/mean_terminated_length": 118.75, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.18235362051045156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.009712208338896744, + "learning_rate": 4.844e-06, + "loss": 0.0004, + "num_tokens": 36764438.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 278.15625, + "completions/mean_terminated_length": 278.15625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.1824691072872156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.010489691616385244, + "learning_rate": 4.8419999999999996e-06, + "loss": 0.0004, + "num_tokens": 36800859.0, + "reward": 3.642336845397949, + "reward_std": 0.8444417119026184, + "rewards/reward_fn/mean": 3.642336845397949, + "rewards/reward_fn/std": 0.8444417119026184, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 115.0, + "completions/mean_terminated_length": 115.0, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.18258459406397967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.012438986916095018, + "learning_rate": 4.839999999999999e-06, + "loss": 0.0005, + "num_tokens": 36816251.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 408.1875, + "completions/mean_terminated_length": 408.1875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.18270008084074374, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.013329090492334217, + "learning_rate": 4.838e-06, + "loss": 0.0005, + "num_tokens": 36842529.0, + "reward": 3.2549967765808105, + "reward_std": 0.7076660394668579, + "rewards/reward_fn/mean": 3.2549967765808105, + "rewards/reward_fn/std": 0.7076660394668579, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 151.1875, + "completions/mean_terminated_length": 151.1875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.1828155676175078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.00970309459080454, + "learning_rate": 4.836e-06, + "loss": 0.0004, + "num_tokens": 36866151.0, + "reward": 3.7961432933807373, + "reward_std": 0.5715003609657288, + "rewards/reward_fn/mean": 3.7961432933807373, + "rewards/reward_fn/std": 0.5715004205703735, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 170.90625, + "completions/mean_terminated_length": 170.90625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.18293105439427185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.013776369043625891, + "learning_rate": 4.834e-06, + "loss": 0.0006, + "num_tokens": 36897380.0, + "reward": 3.6534552574157715, + "reward_std": 0.3892821967601776, + "rewards/reward_fn/mean": 3.6534552574157715, + "rewards/reward_fn/std": 0.3892821967601776, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 244.5625, + "completions/mean_terminated_length": 244.5625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.18304654117103591, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.011174528350238688, + "learning_rate": 4.832e-06, + "loss": 0.0004, + "num_tokens": 36918486.0, + "reward": 3.6294069290161133, + "reward_std": 0.8274264931678772, + "rewards/reward_fn/mean": 3.6294069290161133, + "rewards/reward_fn/std": 0.8274264931678772, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 244.65625, + "completions/mean_terminated_length": 244.65625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.18316202794779998, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.011286249573458917, + "learning_rate": 4.8299999999999995e-06, + "loss": 0.0005, + "num_tokens": 36947883.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 247.875, + "completions/mean_terminated_length": 247.875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.18327751472456405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.009988562684156932, + "learning_rate": 4.828e-06, + "loss": 0.0004, + "num_tokens": 36971879.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 142.3125, + "completions/mean_terminated_length": 142.3125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.1833930015013281, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.01697865905589424, + "learning_rate": 4.825999999999999e-06, + "loss": 0.0007, + "num_tokens": 36991505.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 113.46875, + "completions/mean_terminated_length": 113.46875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.18350848827809216, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.008743608676013537, + "learning_rate": 4.824e-06, + "loss": 0.0003, + "num_tokens": 37019136.0, + "reward": 3.837240695953369, + "reward_std": 0.7320385575294495, + "rewards/reward_fn/mean": 3.837240695953369, + "rewards/reward_fn/std": 0.7320384979248047, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 122.65625, + "completions/mean_terminated_length": 122.65625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.18362397505485623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.006191925167513546, + "learning_rate": 4.822e-06, + "loss": 0.0002, + "num_tokens": 37037237.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 250.28125, + "completions/mean_terminated_length": 250.28125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.1837394618316203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050048828125, + "kl": 0.008340951280843, + "learning_rate": 4.8200000000000004e-06, + "loss": 0.0003, + "num_tokens": 37064958.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 175.1875, + "completions/mean_terminated_length": 175.1875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.18385494860838433, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.018926270815427415, + "learning_rate": 4.8179999999999994e-06, + "loss": 0.0008, + "num_tokens": 37091972.0, + "reward": 3.901764392852783, + "reward_std": 0.21091525256633759, + "rewards/reward_fn/mean": 3.901764392852783, + "rewards/reward_fn/std": 0.2109152376651764, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 119.09375, + "completions/mean_terminated_length": 119.09375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.1839704353851484, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "kl": 0.008217939161113463, + "learning_rate": 4.815999999999999e-06, + "loss": 0.0003, + "num_tokens": 37113863.0, + "reward": 3.068378448486328, + "reward_std": 0.35952532291412354, + "rewards/reward_fn/mean": 3.068378448486328, + "rewards/reward_fn/std": 0.35952526330947876, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 212.3125, + "completions/mean_terminated_length": 212.3125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.18408592216191247, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.00907947274390608, + "learning_rate": 4.814e-06, + "loss": 0.0004, + "num_tokens": 37132881.0, + "reward": 3.923145294189453, + "reward_std": 0.24286092817783356, + "rewards/reward_fn/mean": 3.923145294189453, + "rewards/reward_fn/std": 0.24286091327667236, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 101.1875, + "completions/mean_terminated_length": 101.1875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.1842014089386765, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0, + "kl": 0.009121276390942512, + "learning_rate": 4.812e-06, + "loss": 0.0004, + "num_tokens": 37153783.0, + "reward": 3.5820279121398926, + "reward_std": 0.8558840751647949, + "rewards/reward_fn/mean": 3.5820279121398926, + "rewards/reward_fn/std": 0.8558840751647949, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 308.9375, + "completions/mean_terminated_length": 308.9375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.18431689571544058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041259765625, + "kl": 0.010033524544269312, + "learning_rate": 4.81e-06, + "loss": 0.0004, + "num_tokens": 37177941.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 120.09375, + "completions/mean_terminated_length": 120.09375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.18443238249220464, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.90625, + "kl": 0.023388075744151138, + "learning_rate": 4.8079999999999995e-06, + "loss": 0.0009, + "num_tokens": 37199448.0, + "reward": 3.4506027698516846, + "reward_std": 0.6907051801681519, + "rewards/reward_fn/mean": 3.4506027698516846, + "rewards/reward_fn/std": 0.6907051801681519, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 106.03125, + "completions/mean_terminated_length": 106.03125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.1845478692689687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0390625, + "kl": 0.003951859904191224, + "learning_rate": 4.806e-06, + "loss": 0.0002, + "num_tokens": 37225241.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 69.90625, + "completions/mean_terminated_length": 69.90625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.18466335604573275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19140625, + "kl": 0.014072313620999921, + "learning_rate": 4.804e-06, + "loss": 0.0006, + "num_tokens": 37240118.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 239.6875, + "completions/mean_terminated_length": 239.6875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.18477884282249682, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045166015625, + "kl": 0.010101282852701843, + "learning_rate": 4.801999999999999e-06, + "loss": 0.0004, + "num_tokens": 37262860.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 330.03125, + "completions/mean_terminated_length": 330.03125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.1848943295992609, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.011089643361628987, + "learning_rate": 4.8e-06, + "loss": 0.0004, + "num_tokens": 37289357.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 213.4375, + "completions/mean_terminated_length": 213.4375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.18500981637602495, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.01856039566337131, + "learning_rate": 4.798e-06, + "loss": 0.0007, + "num_tokens": 37307963.0, + "reward": 3.736720561981201, + "reward_std": 0.4635262191295624, + "rewards/reward_fn/mean": 3.736720561981201, + "rewards/reward_fn/std": 0.4635262191295624, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 76.5, + "completions/mean_terminated_length": 76.5, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.185125303152789, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.005991295000058017, + "learning_rate": 4.796e-06, + "loss": 0.0002, + "num_tokens": 37322059.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 86.0, + "completions/max_terminated_length": 86.0, + "completions/mean_length": 59.71875, + "completions/mean_terminated_length": 59.71875, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.18524078992955306, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.359375, + "kl": 0.01398272420919966, + "learning_rate": 4.793999999999999e-06, + "loss": 0.0006, + "num_tokens": 37347458.0, + "reward": 3.962395668029785, + "reward_std": 0.21272209286689758, + "rewards/reward_fn/mean": 3.962395668029785, + "rewards/reward_fn/std": 0.21272209286689758, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 105.03125, + "completions/mean_terminated_length": 105.03125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.18535627670631713, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "kl": 0.008423094150202814, + "learning_rate": 4.791999999999999e-06, + "loss": 0.0003, + "num_tokens": 37376131.0, + "reward": 3.7921886444091797, + "reward_std": 0.6564717888832092, + "rewards/reward_fn/mean": 3.7921886444091797, + "rewards/reward_fn/std": 0.6564717888832092, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 52.78125, + "completions/mean_terminated_length": 52.78125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.1854717634830812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.004975821244443068, + "learning_rate": 4.79e-06, + "loss": 0.0002, + "num_tokens": 37392764.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 63.46875, + "completions/mean_terminated_length": 63.46875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.18558725025984524, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.0073953792270913254, + "learning_rate": 4.788e-06, + "loss": 0.0003, + "num_tokens": 37411851.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 86.46875, + "completions/mean_terminated_length": 86.46875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.1857027370366093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.012580610979057383, + "learning_rate": 4.786e-06, + "loss": 0.0005, + "num_tokens": 37428250.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 66.03125, + "completions/mean_terminated_length": 66.03125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.18581822381337337, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5625, + "kl": 0.010531461441132706, + "learning_rate": 4.7839999999999994e-06, + "loss": 0.0004, + "num_tokens": 37458395.0, + "reward": 2.7346575260162354, + "reward_std": 0.07303672283887863, + "rewards/reward_fn/mean": 2.7346575260162354, + "rewards/reward_fn/std": 0.07303670793771744, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 355.96875, + "completions/mean_terminated_length": 355.96875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.18593371059013744, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.010352500816225074, + "learning_rate": 4.782e-06, + "loss": 0.0004, + "num_tokens": 37486842.0, + "reward": 3.70528507232666, + "reward_std": 0.6652825474739075, + "rewards/reward_fn/mean": 3.70528507232666, + "rewards/reward_fn/std": 0.6652825474739075, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 153.53125, + "completions/mean_terminated_length": 153.53125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.18604919736690148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.011479932101792656, + "learning_rate": 4.78e-06, + "loss": 0.0005, + "num_tokens": 37514347.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1011.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 498.0625, + "completions/mean_terminated_length": 498.0625, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.18616468414366555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060791015625, + "kl": 0.015613105380907655, + "learning_rate": 4.777999999999999e-06, + "loss": 0.0006, + "num_tokens": 37545549.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 258.28125, + "completions/mean_terminated_length": 258.28125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.18628017092042962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034423828125, + "kl": 0.006963544932659715, + "learning_rate": 4.776e-06, + "loss": 0.0003, + "num_tokens": 37564630.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 206.5625, + "completions/mean_terminated_length": 206.5625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.18639565769719368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.016985502224997617, + "learning_rate": 4.7739999999999995e-06, + "loss": 0.0007, + "num_tokens": 37583368.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 185.125, + "completions/mean_terminated_length": 185.125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.18651114447395772, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.012530521475127898, + "learning_rate": 4.772e-06, + "loss": 0.0005, + "num_tokens": 37615052.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 150.78125, + "completions/mean_terminated_length": 150.78125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.1866266312507218, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03369140625, + "kl": 0.0035512653485056944, + "learning_rate": 4.769999999999999e-06, + "loss": 0.0001, + "num_tokens": 37639429.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 111.4375, + "completions/mean_terminated_length": 111.4375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.18674211802748586, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.011242018248594832, + "learning_rate": 4.768e-06, + "loss": 0.0004, + "num_tokens": 37665171.0, + "reward": 3.9383721351623535, + "reward_std": 0.19475793838500977, + "rewards/reward_fn/mean": 3.9383721351623535, + "rewards/reward_fn/std": 0.19475793838500977, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 184.75, + "completions/mean_terminated_length": 184.75, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.18685760480424993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05908203125, + "kl": 0.009262119259801693, + "learning_rate": 4.766e-06, + "loss": 0.0004, + "num_tokens": 37693611.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 103.40625, + "completions/mean_terminated_length": 103.40625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.18697309158101397, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.009003237930301111, + "learning_rate": 4.764e-06, + "loss": 0.0004, + "num_tokens": 37706552.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 193.4375, + "completions/mean_terminated_length": 193.4375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.18708857835777803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.009635347945732065, + "learning_rate": 4.7619999999999995e-06, + "loss": 0.0004, + "num_tokens": 37736454.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 300.4375, + "completions/mean_terminated_length": 300.4375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.1872040651345421, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.011544578570465092, + "learning_rate": 4.759999999999999e-06, + "loss": 0.0005, + "num_tokens": 37761876.0, + "reward": 2.859811782836914, + "reward_std": 0.15097351372241974, + "rewards/reward_fn/mean": 2.859811782836914, + "rewards/reward_fn/std": 0.15097354352474213, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 78.5625, + "completions/mean_terminated_length": 78.5625, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.18731955191130614, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.008745955885387957, + "learning_rate": 4.758e-06, + "loss": 0.0003, + "num_tokens": 37783462.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 81.0, + "completions/mean_terminated_length": 81.0, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.1874350386880702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0546875, + "kl": 0.005636505193251651, + "learning_rate": 4.756e-06, + "loss": 0.0002, + "num_tokens": 37804422.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 191.875, + "completions/mean_terminated_length": 191.875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.18755052546483428, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.012027585536998231, + "learning_rate": 4.754e-06, + "loss": 0.0005, + "num_tokens": 37835234.0, + "reward": 3.8981308937072754, + "reward_std": 0.4299493432044983, + "rewards/reward_fn/mean": 3.8981308937072754, + "rewards/reward_fn/std": 0.4299493134021759, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 210.0625, + "completions/mean_terminated_length": 210.0625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.18766601224159835, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.0094584518665215, + "learning_rate": 4.7519999999999996e-06, + "loss": 0.0004, + "num_tokens": 37864292.0, + "reward": 3.786707639694214, + "reward_std": 0.6737642288208008, + "rewards/reward_fn/mean": 3.786707639694214, + "rewards/reward_fn/std": 0.6737642288208008, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 105.53125, + "completions/mean_terminated_length": 105.53125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.18778149901836239, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "kl": 0.01042733425856568, + "learning_rate": 4.749999999999999e-06, + "loss": 0.0004, + "num_tokens": 37883477.0, + "reward": 3.1012234687805176, + "reward_std": 0.1436353623867035, + "rewards/reward_fn/mean": 3.1012234687805176, + "rewards/reward_fn/std": 0.1436353623867035, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.18789698579512645, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.013221659988630563, + "learning_rate": 4.748e-06, + "loss": 0.0005, + "num_tokens": 37908081.0, + "reward": 3.9324300289154053, + "reward_std": 0.3822338581085205, + "rewards/reward_fn/mean": 3.9324300289154053, + "rewards/reward_fn/std": 0.3822338581085205, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 246.03125, + "completions/mean_terminated_length": 246.03125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.18801247257189052, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.022341390867950395, + "learning_rate": 4.746e-06, + "loss": 0.0009, + "num_tokens": 37938130.0, + "reward": 3.232950210571289, + "reward_std": 0.6443102359771729, + "rewards/reward_fn/mean": 3.232950210571289, + "rewards/reward_fn/std": 0.6443102359771729, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 254.71875, + "completions/mean_terminated_length": 254.71875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.1881279593486546, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.015138364120502956, + "learning_rate": 4.744e-06, + "loss": 0.0006, + "num_tokens": 37969353.0, + "reward": 3.976722002029419, + "reward_std": 0.13168008625507355, + "rewards/reward_fn/mean": 3.976722002029419, + "rewards/reward_fn/std": 0.13168007135391235, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 158.3125, + "completions/mean_terminated_length": 158.3125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.18824344612541863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.011798932726378553, + "learning_rate": 4.742e-06, + "loss": 0.0005, + "num_tokens": 37991059.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 286.78125, + "completions/mean_terminated_length": 286.78125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.1883589329021827, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9296875, + "kl": 0.003586901286325883, + "learning_rate": 4.74e-06, + "loss": 0.0001, + "num_tokens": 38033228.0, + "reward": 3.9701144695281982, + "reward_std": 0.1690584421157837, + "rewards/reward_fn/mean": 3.9701144695281982, + "rewards/reward_fn/std": 0.16905845701694489, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 96.0, + "completions/mean_terminated_length": 96.0, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.18847441967894676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049560546875, + "kl": 0.0035004687242690125, + "learning_rate": 4.738e-06, + "loss": 0.0001, + "num_tokens": 38046188.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 103.40625, + "completions/mean_terminated_length": 103.40625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.18858990645571083, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.005733599464292638, + "learning_rate": 4.735999999999999e-06, + "loss": 0.0002, + "num_tokens": 38066585.0, + "reward": 3.1659963130950928, + "reward_std": 0.030856963247060776, + "rewards/reward_fn/mean": 3.1659963130950928, + "rewards/reward_fn/std": 0.03085697442293167, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 139.53125, + "completions/mean_terminated_length": 139.53125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.18870539323247487, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.010089051807881333, + "learning_rate": 4.734e-06, + "loss": 0.0004, + "num_tokens": 38095498.0, + "reward": 3.9758517742156982, + "reward_std": 0.1366027295589447, + "rewards/reward_fn/mean": 3.9758517742156982, + "rewards/reward_fn/std": 0.1366027444601059, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 90.6875, + "completions/mean_terminated_length": 90.6875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.18882088000923894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.0075449094492796576, + "learning_rate": 4.732e-06, + "loss": 0.0003, + "num_tokens": 38118368.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 196.90625, + "completions/mean_terminated_length": 196.90625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.188936366786003, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.008584355622588191, + "learning_rate": 4.7300000000000005e-06, + "loss": 0.0003, + "num_tokens": 38147165.0, + "reward": 3.970698356628418, + "reward_std": 0.16575579345226288, + "rewards/reward_fn/mean": 3.970698356628418, + "rewards/reward_fn/std": 0.16575579345226288, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 278.34375, + "completions/mean_terminated_length": 278.34375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.18905185356276707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.011097004578914493, + "learning_rate": 4.7279999999999995e-06, + "loss": 0.0004, + "num_tokens": 38171176.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 112.40625, + "completions/mean_terminated_length": 112.40625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.18916734033953111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.01319704542402178, + "learning_rate": 4.726e-06, + "loss": 0.0005, + "num_tokens": 38187317.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 252.5, + "completions/mean_terminated_length": 252.5, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.18928282711629518, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.013928113301517442, + "learning_rate": 4.724e-06, + "loss": 0.0006, + "num_tokens": 38213541.0, + "reward": 3.8004541397094727, + "reward_std": 0.5222804546356201, + "rewards/reward_fn/mean": 3.8004541397094727, + "rewards/reward_fn/std": 0.5222804546356201, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 65.65625, + "completions/mean_terminated_length": 65.65625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.18939831389305925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056396484375, + "kl": 0.005165969974768814, + "learning_rate": 4.722e-06, + "loss": 0.0002, + "num_tokens": 38232762.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 106.0, + "completions/mean_terminated_length": 106.0, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.18951380066982332, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "kl": 0.011204686816199683, + "learning_rate": 4.72e-06, + "loss": 0.0004, + "num_tokens": 38254170.0, + "reward": 3.77046537399292, + "reward_std": 0.44275861978530884, + "rewards/reward_fn/mean": 3.77046537399292, + "rewards/reward_fn/std": 0.44275861978530884, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 216.0625, + "completions/mean_terminated_length": 216.0625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.18962928744658736, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.011814409226644784, + "learning_rate": 4.7179999999999996e-06, + "loss": 0.0005, + "num_tokens": 38284956.0, + "reward": 3.6815624237060547, + "reward_std": 0.2403426170349121, + "rewards/reward_fn/mean": 3.6815624237060547, + "rewards/reward_fn/std": 0.2403426170349121, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 148.6875, + "completions/mean_terminated_length": 148.6875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.18974477422335143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.008241401796112768, + "learning_rate": 4.716e-06, + "loss": 0.0003, + "num_tokens": 38310354.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 188.03125, + "completions/mean_terminated_length": 188.03125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.1898602610001155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.006160647102660732, + "learning_rate": 4.714e-06, + "loss": 0.0002, + "num_tokens": 38336883.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 239.125, + "completions/mean_terminated_length": 239.125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.18997574777687956, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.009713347244542092, + "learning_rate": 4.712e-06, + "loss": 0.0004, + "num_tokens": 38363223.0, + "reward": 3.945295810699463, + "reward_std": 0.22114264965057373, + "rewards/reward_fn/mean": 3.945295810699463, + "rewards/reward_fn/std": 0.22114264965057373, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 149.1875, + "completions/mean_terminated_length": 149.1875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.1900912345536436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.013973457040265203, + "learning_rate": 4.71e-06, + "loss": 0.0006, + "num_tokens": 38384477.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 246.3125, + "completions/mean_terminated_length": 246.3125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.19020672133040767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.010767063678940758, + "learning_rate": 4.708e-06, + "loss": 0.0004, + "num_tokens": 38415463.0, + "reward": 3.561309576034546, + "reward_std": 0.5842985510826111, + "rewards/reward_fn/mean": 3.561309576034546, + "rewards/reward_fn/std": 0.5842985510826111, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 453.65625, + "completions/mean_terminated_length": 453.65625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.19032220810717174, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.007877733776695095, + "learning_rate": 4.706e-06, + "loss": 0.0003, + "num_tokens": 38439676.0, + "reward": 3.9312753677368164, + "reward_std": 0.38876447081565857, + "rewards/reward_fn/mean": 3.9312753677368164, + "rewards/reward_fn/std": 0.38876447081565857, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 167.03125, + "completions/mean_terminated_length": 167.03125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.19043769488393578, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.011762121619540267, + "learning_rate": 4.703999999999999e-06, + "loss": 0.0005, + "num_tokens": 38463421.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 187.96875, + "completions/mean_terminated_length": 187.96875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.19055318166069984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04248046875, + "kl": 0.007442039299348835, + "learning_rate": 4.702e-06, + "loss": 0.0003, + "num_tokens": 38490684.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 95.96875, + "completions/mean_terminated_length": 95.96875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.1906686684374639, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.007128253317205235, + "learning_rate": 4.7e-06, + "loss": 0.0003, + "num_tokens": 38507131.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 295.9375, + "completions/mean_terminated_length": 295.9375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.19078415521422798, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.011452134654973634, + "learning_rate": 4.698000000000001e-06, + "loss": 0.0005, + "num_tokens": 38540569.0, + "reward": 3.338418483734131, + "reward_std": 0.4400709867477417, + "rewards/reward_fn/mean": 3.338418483734131, + "rewards/reward_fn/std": 0.4400709867477417, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 61.96875, + "completions/mean_terminated_length": 61.96875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.19089964199099202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15234375, + "kl": 0.012063054888130864, + "learning_rate": 4.696e-06, + "loss": 0.0005, + "num_tokens": 38553144.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 126.53125, + "completions/mean_terminated_length": 126.53125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.1910151287677561, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.010344217131205369, + "learning_rate": 4.6939999999999994e-06, + "loss": 0.0004, + "num_tokens": 38581449.0, + "reward": 3.9814672470092773, + "reward_std": 0.10483647882938385, + "rewards/reward_fn/mean": 3.9814672470092773, + "rewards/reward_fn/std": 0.10483649373054504, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 134.90625, + "completions/mean_terminated_length": 134.90625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.19113061554452015, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.004720889133750461, + "learning_rate": 4.692e-06, + "loss": 0.0002, + "num_tokens": 38598438.0, + "reward": 3.9796106815338135, + "reward_std": 0.1153397485613823, + "rewards/reward_fn/mean": 3.9796106815338135, + "rewards/reward_fn/std": 0.11533977836370468, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 62.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 45.8125, + "completions/mean_terminated_length": 45.8125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.19124610232128422, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0625, + "kl": 0.011198376876563998, + "learning_rate": 4.69e-06, + "loss": 0.0004, + "num_tokens": 38610048.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 205.1875, + "completions/mean_terminated_length": 205.1875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.19136158909804826, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.013079945696517825, + "learning_rate": 4.688e-06, + "loss": 0.0005, + "num_tokens": 38624710.0, + "reward": 3.856837511062622, + "reward_std": 0.38489606976509094, + "rewards/reward_fn/mean": 3.856837511062622, + "rewards/reward_fn/std": 0.38489601016044617, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 206.375, + "completions/mean_terminated_length": 206.375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.19147707587481233, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.015447547848452814, + "learning_rate": 4.686e-06, + "loss": 0.0006, + "num_tokens": 38653938.0, + "reward": 3.3682761192321777, + "reward_std": 0.5340221524238586, + "rewards/reward_fn/mean": 3.3682761192321777, + "rewards/reward_fn/std": 0.5340221524238586, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 211.84375, + "completions/mean_terminated_length": 211.84375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.1915925626515764, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.008834922482492402, + "learning_rate": 4.6839999999999995e-06, + "loss": 0.0004, + "num_tokens": 38682317.0, + "reward": 2.851914882659912, + "reward_std": 0.2761073112487793, + "rewards/reward_fn/mean": 2.851914882659912, + "rewards/reward_fn/std": 0.2761073708534241, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 87.84375, + "completions/mean_terminated_length": 87.84375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.19170804942834047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.010730118374340236, + "learning_rate": 4.682e-06, + "loss": 0.0004, + "num_tokens": 38704168.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 238.8125, + "completions/mean_terminated_length": 238.8125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.1918235362051045, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.018826673593139276, + "learning_rate": 4.679999999999999e-06, + "loss": 0.0008, + "num_tokens": 38730018.0, + "reward": 3.7784371376037598, + "reward_std": 0.4932415783405304, + "rewards/reward_fn/mean": 3.7784371376037598, + "rewards/reward_fn/std": 0.4932416081428528, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 141.96875, + "completions/mean_terminated_length": 141.96875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.19193902298186857, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.012440390957635827, + "learning_rate": 4.678e-06, + "loss": 0.0005, + "num_tokens": 38760577.0, + "reward": 3.8375091552734375, + "reward_std": 0.4840359687805176, + "rewards/reward_fn/mean": 3.8375091552734375, + "rewards/reward_fn/std": 0.4840359687805176, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 243.65625, + "completions/mean_terminated_length": 243.65625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.19205450975863264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04248046875, + "kl": 0.008005144867638592, + "learning_rate": 4.676e-06, + "loss": 0.0003, + "num_tokens": 38783670.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 117.8125, + "completions/mean_terminated_length": 117.8125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.1921699965353967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.010815533285494894, + "learning_rate": 4.6740000000000005e-06, + "loss": 0.0004, + "num_tokens": 38807440.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 313.375, + "completions/mean_terminated_length": 313.375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.19228548331216075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0390625, + "kl": 0.009240034312824719, + "learning_rate": 4.6719999999999995e-06, + "loss": 0.0004, + "num_tokens": 38828572.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 62.375, + "completions/mean_terminated_length": 62.375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.19240097008892482, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.0068271217387518845, + "learning_rate": 4.669999999999999e-06, + "loss": 0.0003, + "num_tokens": 38847272.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 192.9375, + "completions/mean_terminated_length": 192.9375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.19251645686568888, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.009811346004426014, + "learning_rate": 4.668e-06, + "loss": 0.0004, + "num_tokens": 38865542.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 160.875, + "completions/mean_terminated_length": 160.875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.19263194364245295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.138671875, + "kl": 0.015465491713257506, + "learning_rate": 4.666e-06, + "loss": 0.0006, + "num_tokens": 38894146.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 91.8125, + "completions/mean_terminated_length": 91.8125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.192747430419217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.009904525242745876, + "learning_rate": 4.664e-06, + "loss": 0.0004, + "num_tokens": 38919484.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 158.96875, + "completions/mean_terminated_length": 158.96875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.19286291719598106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.013080784337944351, + "learning_rate": 4.662e-06, + "loss": 0.0005, + "num_tokens": 38945179.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 261.96875, + "completions/mean_terminated_length": 261.96875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.19297840397274513, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.009278323683247436, + "learning_rate": 4.66e-06, + "loss": 0.0004, + "num_tokens": 38977882.0, + "reward": 2.938621759414673, + "reward_std": 0.39720577001571655, + "rewards/reward_fn/mean": 2.938621759414673, + "rewards/reward_fn/std": 0.39720574021339417, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 250.09375, + "completions/mean_terminated_length": 250.09375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.1930938907495092, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.01179001442505978, + "learning_rate": 4.658e-06, + "loss": 0.0005, + "num_tokens": 38998237.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 399.71875, + "completions/mean_terminated_length": 399.71875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.19320937752627323, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.009434336330741644, + "learning_rate": 4.655999999999999e-06, + "loss": 0.0004, + "num_tokens": 39034196.0, + "reward": 3.7800257205963135, + "reward_std": 0.5135405659675598, + "rewards/reward_fn/mean": 3.7800257205963135, + "rewards/reward_fn/std": 0.5135405659675598, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 73.03125, + "completions/mean_terminated_length": 73.03125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.1933248643030373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.007220942417916376, + "learning_rate": 4.654e-06, + "loss": 0.0003, + "num_tokens": 39054517.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 189.03125, + "completions/mean_terminated_length": 189.03125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.19344035107980137, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.0094090250422596, + "learning_rate": 4.652e-06, + "loss": 0.0004, + "num_tokens": 39068630.0, + "reward": 3.9294981956481934, + "reward_std": 0.3988178074359894, + "rewards/reward_fn/mean": 3.9294981956481934, + "rewards/reward_fn/std": 0.39881783723831177, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 267.28125, + "completions/mean_terminated_length": 267.28125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.1935558378565654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.012592573868460022, + "learning_rate": 4.65e-06, + "loss": 0.0005, + "num_tokens": 39100223.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 290.375, + "completions/mean_terminated_length": 290.375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.19367132463332948, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.011582240520510823, + "learning_rate": 4.647999999999999e-06, + "loss": 0.0005, + "num_tokens": 39121963.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 98.46875, + "completions/mean_terminated_length": 98.46875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.19378681141009355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.006412502749299165, + "learning_rate": 4.646e-06, + "loss": 0.0003, + "num_tokens": 39140186.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 132.65625, + "completions/mean_terminated_length": 132.65625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.1939022981868576, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.859375, + "kl": 0.017951958419871517, + "learning_rate": 4.644e-06, + "loss": 0.0007, + "num_tokens": 39165295.0, + "reward": 3.976635456085205, + "reward_std": 0.1321706473827362, + "rewards/reward_fn/mean": 3.976635456085205, + "rewards/reward_fn/std": 0.1321706622838974, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 247.3125, + "completions/mean_terminated_length": 247.3125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.19401778496362165, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.014864929442410357, + "learning_rate": 4.642e-06, + "loss": 0.0006, + "num_tokens": 39194265.0, + "reward": 3.6584548950195312, + "reward_std": 0.4908693730831146, + "rewards/reward_fn/mean": 3.6584548950195312, + "rewards/reward_fn/std": 0.49086934328079224, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 259.0625, + "completions/mean_terminated_length": 259.0625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.19413327174038572, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.01942346093710512, + "learning_rate": 4.64e-06, + "loss": 0.0008, + "num_tokens": 39219835.0, + "reward": 3.4260449409484863, + "reward_std": 0.45182111859321594, + "rewards/reward_fn/mean": 3.4260449409484863, + "rewards/reward_fn/std": 0.45182114839553833, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 325.40625, + "completions/mean_terminated_length": 325.40625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.1942487585171498, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.012974447585293092, + "learning_rate": 4.6379999999999995e-06, + "loss": 0.0005, + "num_tokens": 39250248.0, + "reward": 2.479489326477051, + "reward_std": 0.9552228450775146, + "rewards/reward_fn/mean": 2.479489326477051, + "rewards/reward_fn/std": 0.9552227854728699, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 170.71875, + "completions/mean_terminated_length": 170.71875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.19436424529391386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.00955387533758767, + "learning_rate": 4.636e-06, + "loss": 0.0004, + "num_tokens": 39274559.0, + "reward": 3.9108638763427734, + "reward_std": 0.2407812774181366, + "rewards/reward_fn/mean": 3.9108638763427734, + "rewards/reward_fn/std": 0.2407812923192978, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 312.78125, + "completions/mean_terminated_length": 256.80645751953125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.1944797320706779, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48046875, + "kl": 0.009865029918728396, + "learning_rate": 4.634e-06, + "loss": 0.0004, + "num_tokens": 39299640.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 241.65625, + "completions/mean_terminated_length": 241.65625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.19459521884744196, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.020409056465723552, + "learning_rate": 4.632e-06, + "loss": 0.0008, + "num_tokens": 39320365.0, + "reward": 2.5956616401672363, + "reward_std": 0.6818588972091675, + "rewards/reward_fn/mean": 2.5956616401672363, + "rewards/reward_fn/std": 0.6818588972091675, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 265.90625, + "completions/mean_terminated_length": 265.90625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.19471070562420603, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.009287423687055707, + "learning_rate": 4.63e-06, + "loss": 0.0004, + "num_tokens": 39352330.0, + "reward": 3.9303946495056152, + "reward_std": 0.39374756813049316, + "rewards/reward_fn/mean": 3.9303946495056152, + "rewards/reward_fn/std": 0.39374756813049316, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 112.53125, + "completions/mean_terminated_length": 112.53125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.1948261924009701, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.015550668351352215, + "learning_rate": 4.6279999999999996e-06, + "loss": 0.0006, + "num_tokens": 39380443.0, + "reward": 3.8290185928344727, + "reward_std": 0.5625805258750916, + "rewards/reward_fn/mean": 3.8290185928344727, + "rewards/reward_fn/std": 0.5625804662704468, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 140.25, + "completions/mean_terminated_length": 140.25, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.19494167917773414, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.011306048261758406, + "learning_rate": 4.626e-06, + "loss": 0.0005, + "num_tokens": 39392995.0, + "reward": 3.9790725708007812, + "reward_std": 0.1183832436800003, + "rewards/reward_fn/mean": 3.9790725708007812, + "rewards/reward_fn/std": 0.11838320642709732, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 84.125, + "completions/mean_terminated_length": 84.125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.1950571659544982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.008913427056540968, + "learning_rate": 4.623999999999999e-06, + "loss": 0.0004, + "num_tokens": 39407911.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 106.5, + "completions/mean_terminated_length": 106.5, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.19517265273126227, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.005474380643136101, + "learning_rate": 4.622e-06, + "loss": 0.0002, + "num_tokens": 39428791.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 93.625, + "completions/mean_terminated_length": 93.625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.19528813950802634, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.019545594361261465, + "learning_rate": 4.62e-06, + "loss": 0.0008, + "num_tokens": 39455595.0, + "reward": 3.1597297191619873, + "reward_std": 0.033942658454179764, + "rewards/reward_fn/mean": 3.1597297191619873, + "rewards/reward_fn/std": 0.03394269198179245, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 152.5, + "completions/mean_terminated_length": 152.5, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.19540362628479038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.007988013974681962, + "learning_rate": 4.6180000000000005e-06, + "loss": 0.0003, + "num_tokens": 39484507.0, + "reward": 3.7731995582580566, + "reward_std": 0.4359476864337921, + "rewards/reward_fn/mean": 3.7731995582580566, + "rewards/reward_fn/std": 0.4359476864337921, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 80.4375, + "completions/mean_terminated_length": 80.4375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.19551911306155445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.006565982010215521, + "learning_rate": 4.6159999999999995e-06, + "loss": 0.0003, + "num_tokens": 39499177.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.19563459983831852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.008720827485376503, + "learning_rate": 4.613999999999999e-06, + "loss": 0.0004, + "num_tokens": 39532127.0, + "reward": 2.7237660884857178, + "reward_std": 0.03567052632570267, + "rewards/reward_fn/mean": 2.7237660884857178, + "rewards/reward_fn/std": 0.03567051514983177, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 101.4375, + "completions/mean_terminated_length": 101.4375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.19575008661508259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.013054751594609115, + "learning_rate": 4.612e-06, + "loss": 0.0005, + "num_tokens": 39545005.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 154.15625, + "completions/mean_terminated_length": 154.15625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.19586557339184663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.018702979461522773, + "learning_rate": 4.61e-06, + "loss": 0.0007, + "num_tokens": 39575826.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 172.375, + "completions/mean_terminated_length": 172.375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.1959810601686107, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.0075417021798784845, + "learning_rate": 4.608e-06, + "loss": 0.0003, + "num_tokens": 39595806.0, + "reward": 2.703219175338745, + "reward_std": 0.05078702047467232, + "rewards/reward_fn/mean": 2.703219175338745, + "rewards/reward_fn/std": 0.050786975771188736, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 183.84375, + "completions/mean_terminated_length": 183.84375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.19609654694537476, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.016609056081506424, + "learning_rate": 4.606e-06, + "loss": 0.0007, + "num_tokens": 39624729.0, + "reward": 3.909058094024658, + "reward_std": 0.24497300386428833, + "rewards/reward_fn/mean": 3.909058094024658, + "rewards/reward_fn/std": 0.24497301876544952, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 145.46875, + "completions/mean_terminated_length": 145.46875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.19621203372213883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.01379128682310693, + "learning_rate": 4.604e-06, + "loss": 0.0006, + "num_tokens": 39641608.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 392.9375, + "completions/mean_terminated_length": 392.9375, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.19632752049890287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03857421875, + "kl": 0.00791451505938312, + "learning_rate": 4.602e-06, + "loss": 0.0003, + "num_tokens": 39669638.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 255.65625, + "completions/mean_terminated_length": 255.65625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.19644300727566694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037841796875, + "kl": 0.00745667249429971, + "learning_rate": 4.599999999999999e-06, + "loss": 0.0003, + "num_tokens": 39693339.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 105.4375, + "completions/mean_terminated_length": 105.4375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.196558494052431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.008616899824119173, + "learning_rate": 4.598e-06, + "loss": 0.0003, + "num_tokens": 39710921.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 331.75, + "completions/mean_terminated_length": 331.75, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.19667398082919504, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.0108345462649595, + "learning_rate": 4.596e-06, + "loss": 0.0004, + "num_tokens": 39731873.0, + "reward": 3.9303641319274902, + "reward_std": 0.3939204812049866, + "rewards/reward_fn/mean": 3.9303641319274902, + "rewards/reward_fn/std": 0.3939204514026642, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 296.09375, + "completions/mean_terminated_length": 296.09375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.1967894676059591, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045166015625, + "kl": 0.009164901064650621, + "learning_rate": 4.594e-06, + "loss": 0.0004, + "num_tokens": 39756484.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 348.46875, + "completions/mean_terminated_length": 348.46875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.19690495438272318, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.013992953303386457, + "learning_rate": 4.591999999999999e-06, + "loss": 0.0006, + "num_tokens": 39776147.0, + "reward": 3.068485736846924, + "reward_std": 0.4783761501312256, + "rewards/reward_fn/mean": 3.068485736846924, + "rewards/reward_fn/std": 0.4783761203289032, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 64.1875, + "completions/mean_terminated_length": 64.1875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.19702044115948725, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.010889487064559944, + "learning_rate": 4.589999999999999e-06, + "loss": 0.0004, + "num_tokens": 39797209.0, + "reward": 3.932713508605957, + "reward_std": 0.38062968850135803, + "rewards/reward_fn/mean": 3.932713508605957, + "rewards/reward_fn/std": 0.38062968850135803, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 67.21875, + "completions/mean_terminated_length": 67.21875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.1971359279362513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.007628156734426739, + "learning_rate": 4.588e-06, + "loss": 0.0003, + "num_tokens": 39811104.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.19725141471301536, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.016256475108093582, + "learning_rate": 4.586e-06, + "loss": 0.0007, + "num_tokens": 39844358.0, + "reward": 3.5059056282043457, + "reward_std": 0.44860923290252686, + "rewards/reward_fn/mean": 3.5059056282043457, + "rewards/reward_fn/std": 0.44860923290252686, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 220.3125, + "completions/mean_terminated_length": 220.3125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.19736690148977942, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.019368950772332028, + "learning_rate": 4.584e-06, + "loss": 0.0008, + "num_tokens": 39863760.0, + "reward": 3.9790961742401123, + "reward_std": 0.11825019866228104, + "rewards/reward_fn/mean": 3.9790961742401123, + "rewards/reward_fn/std": 0.11825019121170044, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 118.84375, + "completions/mean_terminated_length": 118.84375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.1974823882665435, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.007821464434528025, + "learning_rate": 4.5819999999999995e-06, + "loss": 0.0003, + "num_tokens": 39879147.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 74.46875, + "completions/mean_terminated_length": 74.46875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.19759787504330753, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.007065602458169451, + "learning_rate": 4.58e-06, + "loss": 0.0003, + "num_tokens": 39892890.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 136.71875, + "completions/mean_terminated_length": 136.71875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.1977133618200716, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.013178896275348961, + "learning_rate": 4.578e-06, + "loss": 0.0005, + "num_tokens": 39919921.0, + "reward": 3.930579423904419, + "reward_std": 0.3927015960216522, + "rewards/reward_fn/mean": 3.930579423904419, + "rewards/reward_fn/std": 0.3927016258239746, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 90.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 61.84375, + "completions/mean_terminated_length": 61.84375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.19782884859683567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.427734375, + "kl": 0.022776418350986205, + "learning_rate": 4.575999999999999e-06, + "loss": 0.0009, + "num_tokens": 39933676.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 187.34375, + "completions/mean_terminated_length": 187.34375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.19794433537359973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.01421147356450092, + "learning_rate": 4.574e-06, + "loss": 0.0006, + "num_tokens": 39955159.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 203.96875, + "completions/mean_terminated_length": 203.96875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.19805982215036377, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.017559557309141383, + "learning_rate": 4.572e-06, + "loss": 0.0007, + "num_tokens": 39983990.0, + "reward": 3.6870884895324707, + "reward_std": 0.5279752016067505, + "rewards/reward_fn/mean": 3.6870884895324707, + "rewards/reward_fn/std": 0.5279752612113953, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 124.34375, + "completions/mean_terminated_length": 124.34375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.19817530892712784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.00991223910386907, + "learning_rate": 4.57e-06, + "loss": 0.0004, + "num_tokens": 39999233.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 324.03125, + "completions/mean_terminated_length": 324.03125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.1982907957038919, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.012308815101278014, + "learning_rate": 4.567999999999999e-06, + "loss": 0.0005, + "num_tokens": 40021762.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 119.25, + "completions/mean_terminated_length": 119.25, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.19840628248065598, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.011546327303221915, + "learning_rate": 4.566e-06, + "loss": 0.0005, + "num_tokens": 40050026.0, + "reward": 3.929140329360962, + "reward_std": 0.4008429944515228, + "rewards/reward_fn/mean": 3.929140329360962, + "rewards/reward_fn/std": 0.4008430242538452, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 228.21875, + "completions/mean_terminated_length": 228.21875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.19852176925742002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.01028909848537296, + "learning_rate": 4.564e-06, + "loss": 0.0004, + "num_tokens": 40065201.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 182.4375, + "completions/mean_terminated_length": 182.4375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.19863725603418408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057861328125, + "kl": 0.008854106374201365, + "learning_rate": 4.562e-06, + "loss": 0.0004, + "num_tokens": 40083487.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 81.71875, + "completions/mean_terminated_length": 81.71875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.19875274281094815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1572265625, + "kl": 0.016858231130754575, + "learning_rate": 4.5599999999999995e-06, + "loss": 0.0007, + "num_tokens": 40107734.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 208.59375, + "completions/mean_terminated_length": 208.59375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.19886822958771222, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.011594668714678846, + "learning_rate": 4.557999999999999e-06, + "loss": 0.0005, + "num_tokens": 40138665.0, + "reward": 2.947871685028076, + "reward_std": 0.39560484886169434, + "rewards/reward_fn/mean": 2.947871685028076, + "rewards/reward_fn/std": 0.39560484886169434, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 96.84375, + "completions/mean_terminated_length": 96.84375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.19898371636447626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.009365578596771229, + "learning_rate": 4.556e-06, + "loss": 0.0004, + "num_tokens": 40153796.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 606.40625, + "completions/mean_terminated_length": 457.2758483886719, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.19909920314124033, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8359375, + "kl": 0.010521152402361622, + "learning_rate": 4.554e-06, + "loss": 0.0004, + "num_tokens": 40181681.0, + "reward": 3.286651134490967, + "reward_std": 1.1426488161087036, + "rewards/reward_fn/mean": 3.286651134490967, + "rewards/reward_fn/std": 1.1426488161087036, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 94.09375, + "completions/mean_terminated_length": 94.09375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.1992146899180044, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.01361971520964289, + "learning_rate": 4.552e-06, + "loss": 0.0005, + "num_tokens": 40198196.0, + "reward": 3.5791895389556885, + "reward_std": 0.05553041398525238, + "rewards/reward_fn/mean": 3.5791895389556885, + "rewards/reward_fn/std": 0.055530399084091187, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 329.28125, + "completions/mean_terminated_length": 329.28125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.19933017669476846, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.00827454226964619, + "learning_rate": 4.55e-06, + "loss": 0.0003, + "num_tokens": 40227933.0, + "reward": 3.849684953689575, + "reward_std": 0.36480140686035156, + "rewards/reward_fn/mean": 3.849684953689575, + "rewards/reward_fn/std": 0.36480143666267395, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 165.6875, + "completions/mean_terminated_length": 165.6875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.1994456634715325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.01440770061162766, + "learning_rate": 4.5479999999999995e-06, + "loss": 0.0006, + "num_tokens": 40248243.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 161.71875, + "completions/mean_terminated_length": 161.71875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.19956115024829657, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.008532635241863318, + "learning_rate": 4.546e-06, + "loss": 0.0003, + "num_tokens": 40267306.0, + "reward": 2.851757526397705, + "reward_std": 0.03492879867553711, + "rewards/reward_fn/mean": 2.851757526397705, + "rewards/reward_fn/std": 0.03492877632379532, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 354.96875, + "completions/mean_terminated_length": 354.96875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.19967663702506064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051513671875, + "kl": 0.01077517586236354, + "learning_rate": 4.543999999999999e-06, + "loss": 0.0004, + "num_tokens": 40293865.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 174.65625, + "completions/mean_terminated_length": 174.65625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.19979212380182468, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.01602821312553715, + "learning_rate": 4.542e-06, + "loss": 0.0006, + "num_tokens": 40322782.0, + "reward": 3.6446897983551025, + "reward_std": 0.8389098048210144, + "rewards/reward_fn/mean": 3.6446897983551025, + "rewards/reward_fn/std": 0.8389098048210144, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 209.46875, + "completions/mean_terminated_length": 209.46875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.19990761057858875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.013973949317005463, + "learning_rate": 4.54e-06, + "loss": 0.0006, + "num_tokens": 40353901.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 47.5625, + "completions/mean_terminated_length": 47.5625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.2000230973553528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.46484375, + "kl": 0.026975318614859134, + "learning_rate": 4.5380000000000004e-06, + "loss": 0.0011, + "num_tokens": 40364735.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 96.0625, + "completions/mean_terminated_length": 96.0625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.20013858413211688, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.00981252727797255, + "learning_rate": 4.5359999999999994e-06, + "loss": 0.0004, + "num_tokens": 40385953.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 286.3125, + "completions/mean_terminated_length": 286.3125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.20025407090888092, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.013447711840854026, + "learning_rate": 4.533999999999999e-06, + "loss": 0.0005, + "num_tokens": 40406123.0, + "reward": 3.777846097946167, + "reward_std": 0.4272802770137787, + "rewards/reward_fn/mean": 3.777846097946167, + "rewards/reward_fn/std": 0.4272802472114563, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 353.15625, + "completions/mean_terminated_length": 353.15625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.200369557685645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04345703125, + "kl": 0.009938408067682758, + "learning_rate": 4.532e-06, + "loss": 0.0004, + "num_tokens": 40427024.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 211.59375, + "completions/mean_terminated_length": 211.59375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.20048504446240906, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.018339305563131347, + "learning_rate": 4.53e-06, + "loss": 0.0007, + "num_tokens": 40452867.0, + "reward": 3.028228998184204, + "reward_std": 0.0608033649623394, + "rewards/reward_fn/mean": 3.028228998184204, + "rewards/reward_fn/std": 0.060803357511758804, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 50.46875, + "completions/mean_terminated_length": 50.46875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.20060053123917312, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5625, + "kl": 0.013954140878922772, + "learning_rate": 4.528e-06, + "loss": 0.0006, + "num_tokens": 40466674.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 139.9375, + "completions/mean_terminated_length": 139.9375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.20071601801593716, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.019114666065433994, + "learning_rate": 4.5259999999999995e-06, + "loss": 0.0008, + "num_tokens": 40495120.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 66.0, + "completions/max_terminated_length": 66.0, + "completions/mean_length": 55.375, + "completions/mean_terminated_length": 55.375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.20083150479270123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.01101571290200809, + "learning_rate": 4.524e-06, + "loss": 0.0004, + "num_tokens": 40516572.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 371.65625, + "completions/mean_terminated_length": 317.58062744140625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.2009469915694653, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.014767964152269997, + "learning_rate": 4.522e-06, + "loss": 0.0006, + "num_tokens": 40540849.0, + "reward": 3.804234027862549, + "reward_std": 0.8012415766716003, + "rewards/reward_fn/mean": 3.804234027862549, + "rewards/reward_fn/std": 0.8012415170669556, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 153.90625, + "completions/mean_terminated_length": 153.90625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.20106247834622937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.0113297974166926, + "learning_rate": 4.519999999999999e-06, + "loss": 0.0005, + "num_tokens": 40570286.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 185.25, + "completions/mean_terminated_length": 185.25, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.2011779651229934, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.009533474025374744, + "learning_rate": 4.518e-06, + "loss": 0.0004, + "num_tokens": 40601014.0, + "reward": 3.92655086517334, + "reward_std": 0.415490984916687, + "rewards/reward_fn/mean": 3.92655086517334, + "rewards/reward_fn/std": 0.415490984916687, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 222.28125, + "completions/mean_terminated_length": 222.28125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.20129345189975748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.012710196155239828, + "learning_rate": 4.516e-06, + "loss": 0.0005, + "num_tokens": 40627231.0, + "reward": 3.8727540969848633, + "reward_std": 0.3029966652393341, + "rewards/reward_fn/mean": 3.8727540969848633, + "rewards/reward_fn/std": 0.3029966652393341, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 107.53125, + "completions/mean_terminated_length": 107.53125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.20140893867652154, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1826171875, + "kl": 0.02431287826038897, + "learning_rate": 4.514e-06, + "loss": 0.001, + "num_tokens": 40643568.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 127.34375, + "completions/mean_terminated_length": 127.34375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.2015244254532856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.009745156283315737, + "learning_rate": 4.511999999999999e-06, + "loss": 0.0004, + "num_tokens": 40672571.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.20163991223004965, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.011559743681573309, + "learning_rate": 4.509999999999999e-06, + "loss": 0.0005, + "num_tokens": 40693390.0, + "reward": 3.926909923553467, + "reward_std": 0.4134599268436432, + "rewards/reward_fn/mean": 3.926909923553467, + "rewards/reward_fn/std": 0.4134599566459656, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 277.8125, + "completions/mean_terminated_length": 277.8125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.20175539900681372, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.010414123142254539, + "learning_rate": 4.508e-06, + "loss": 0.0004, + "num_tokens": 40714472.0, + "reward": 3.971217393875122, + "reward_std": 0.16281861066818237, + "rewards/reward_fn/mean": 3.971217393875122, + "rewards/reward_fn/std": 0.16281858086585999, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 185.84375, + "completions/mean_terminated_length": 185.84375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.20187088578357779, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.015338629527832381, + "learning_rate": 4.506e-06, + "loss": 0.0006, + "num_tokens": 40741923.0, + "reward": 2.994335174560547, + "reward_std": 0.028866469860076904, + "rewards/reward_fn/mean": 2.994335174560547, + "rewards/reward_fn/std": 0.028866475448012352, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 80.4375, + "completions/mean_terminated_length": 80.4375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.20198637256034185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.008143202401697636, + "learning_rate": 4.5039999999999996e-06, + "loss": 0.0003, + "num_tokens": 40756625.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 160.96875, + "completions/mean_terminated_length": 160.96875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.2021018593371059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057373046875, + "kl": 0.009222642627719324, + "learning_rate": 4.501999999999999e-06, + "loss": 0.0004, + "num_tokens": 40777104.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 138.96875, + "completions/mean_terminated_length": 138.96875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.20221734611386996, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.008327256451593712, + "learning_rate": 4.5e-06, + "loss": 0.0003, + "num_tokens": 40795855.0, + "reward": 3.93027400970459, + "reward_std": 0.39442893862724304, + "rewards/reward_fn/mean": 3.93027400970459, + "rewards/reward_fn/std": 0.39442890882492065, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 175.375, + "completions/mean_terminated_length": 175.375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.20233283289063403, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.011756123261875473, + "learning_rate": 4.498e-06, + "loss": 0.0005, + "num_tokens": 40813947.0, + "reward": 3.364011764526367, + "reward_std": 0.5720310807228088, + "rewards/reward_fn/mean": 3.364011764526367, + "rewards/reward_fn/std": 0.5720310807228088, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 90.21875, + "completions/mean_terminated_length": 90.21875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.2024483196673981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.154296875, + "kl": 0.0219358221947914, + "learning_rate": 4.496000000000001e-06, + "loss": 0.0009, + "num_tokens": 40838466.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 205.84375, + "completions/mean_terminated_length": 205.84375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.20256380644416214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1943359375, + "kl": 0.013897361030103639, + "learning_rate": 4.494e-06, + "loss": 0.0006, + "num_tokens": 40868317.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 79.40625, + "completions/mean_terminated_length": 79.40625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.2026792932209262, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.009975345004932024, + "learning_rate": 4.4919999999999995e-06, + "loss": 0.0004, + "num_tokens": 40893098.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 193.0, + "completions/mean_terminated_length": 193.0, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.20279477999769027, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.013585241336841136, + "learning_rate": 4.49e-06, + "loss": 0.0005, + "num_tokens": 40921866.0, + "reward": 3.580397367477417, + "reward_std": 0.39153099060058594, + "rewards/reward_fn/mean": 3.580397367477417, + "rewards/reward_fn/std": 0.39153096079826355, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 90.59375, + "completions/mean_terminated_length": 90.59375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.2029102667744543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.00924819663487142, + "learning_rate": 4.488e-06, + "loss": 0.0004, + "num_tokens": 40936925.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 113.0, + "completions/mean_terminated_length": 113.0, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.20302575355121838, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.84375, + "kl": 0.029490193439414725, + "learning_rate": 4.486e-06, + "loss": 0.0012, + "num_tokens": 40966333.0, + "reward": 3.551300525665283, + "reward_std": 0.3953414559364319, + "rewards/reward_fn/mean": 3.551300525665283, + "rewards/reward_fn/std": 0.3953414261341095, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 145.34375, + "completions/mean_terminated_length": 145.34375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.20314124032798245, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.013390524880378507, + "learning_rate": 4.484e-06, + "loss": 0.0005, + "num_tokens": 40993160.0, + "reward": 3.9283719062805176, + "reward_std": 0.28208836913108826, + "rewards/reward_fn/mean": 3.9283719062805176, + "rewards/reward_fn/std": 0.28208836913108826, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 259.0625, + "completions/mean_terminated_length": 259.0625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.20325672710474652, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.015164164367888588, + "learning_rate": 4.482e-06, + "loss": 0.0006, + "num_tokens": 41025418.0, + "reward": 3.4767298698425293, + "reward_std": 0.7140443325042725, + "rewards/reward_fn/mean": 3.4767298698425293, + "rewards/reward_fn/std": 0.7140443325042725, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 239.21875, + "completions/mean_terminated_length": 239.21875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.20337221388151056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.009565149564878084, + "learning_rate": 4.48e-06, + "loss": 0.0004, + "num_tokens": 41058129.0, + "reward": 3.7953624725341797, + "reward_std": 0.6464908719062805, + "rewards/reward_fn/mean": 3.7953624725341797, + "rewards/reward_fn/std": 0.6464908719062805, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 130.375, + "completions/mean_terminated_length": 130.375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.20348770065827462, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.011680082665407099, + "learning_rate": 4.477999999999999e-06, + "loss": 0.0005, + "num_tokens": 41074557.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 389.90625, + "completions/mean_terminated_length": 389.90625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.2036031874350387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041259765625, + "kl": 0.010076979626319371, + "learning_rate": 4.476e-06, + "loss": 0.0004, + "num_tokens": 41105946.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 70.0625, + "completions/mean_terminated_length": 70.0625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.20371867421180276, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.46875, + "kl": 0.008883449976565316, + "learning_rate": 4.474e-06, + "loss": 0.0004, + "num_tokens": 41120700.0, + "reward": 2.813145875930786, + "reward_std": 0.03342433646321297, + "rewards/reward_fn/mean": 2.813145875930786, + "rewards/reward_fn/std": 0.033424295485019684, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 111.3125, + "completions/mean_terminated_length": 111.3125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.2038341609885668, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "kl": 0.010225848287518602, + "learning_rate": 4.4720000000000006e-06, + "loss": 0.0004, + "num_tokens": 41141958.0, + "reward": 3.959912061691284, + "reward_std": 0.15778391063213348, + "rewards/reward_fn/mean": 3.959912061691284, + "rewards/reward_fn/std": 0.15778392553329468, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 138.5625, + "completions/mean_terminated_length": 138.5625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.20394964776533087, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.008672191856021527, + "learning_rate": 4.4699999999999996e-06, + "loss": 0.0003, + "num_tokens": 41157368.0, + "reward": 3.861234188079834, + "reward_std": 0.5460375547409058, + "rewards/reward_fn/mean": 3.861234188079834, + "rewards/reward_fn/std": 0.5460375547409058, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 180.78125, + "completions/mean_terminated_length": 180.78125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.20406513454209493, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.0181983144429978, + "learning_rate": 4.467999999999999e-06, + "loss": 0.0007, + "num_tokens": 41175505.0, + "reward": 3.9443116188049316, + "reward_std": 0.22011412680149078, + "rewards/reward_fn/mean": 3.9443116188049316, + "rewards/reward_fn/std": 0.22011415660381317, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 74.65625, + "completions/mean_terminated_length": 74.65625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.204180621318859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.011379012241377495, + "learning_rate": 4.466e-06, + "loss": 0.0005, + "num_tokens": 41200934.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 175.375, + "completions/mean_terminated_length": 175.375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.20429610809562304, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.025157734635286033, + "learning_rate": 4.464e-06, + "loss": 0.001, + "num_tokens": 41228978.0, + "reward": 3.8199920654296875, + "reward_std": 0.3807898163795471, + "rewards/reward_fn/mean": 3.8199920654296875, + "rewards/reward_fn/std": 0.3807898461818695, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 177.5625, + "completions/mean_terminated_length": 177.5625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.2044115948723871, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.007831717011868022, + "learning_rate": 4.462e-06, + "loss": 0.0003, + "num_tokens": 41254500.0, + "reward": 3.9043688774108887, + "reward_std": 0.41713622212409973, + "rewards/reward_fn/mean": 3.9043688774108887, + "rewards/reward_fn/std": 0.41713616251945496, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 139.25, + "completions/mean_terminated_length": 139.25, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.20452708164915118, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.015229332362650894, + "learning_rate": 4.46e-06, + "loss": 0.0006, + "num_tokens": 41274732.0, + "reward": 3.5761866569519043, + "reward_std": 0.5565059781074524, + "rewards/reward_fn/mean": 3.5761866569519043, + "rewards/reward_fn/std": 0.5565059185028076, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 236.3125, + "completions/mean_terminated_length": 236.3125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.20464256842591524, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.01766483034589328, + "learning_rate": 4.458e-06, + "loss": 0.0007, + "num_tokens": 41302678.0, + "reward": 3.7543303966522217, + "reward_std": 0.586068332195282, + "rewards/reward_fn/mean": 3.7543303966522217, + "rewards/reward_fn/std": 0.5860682725906372, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 177.1875, + "completions/mean_terminated_length": 177.1875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.20475805520267928, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05859375, + "kl": 0.009468365053180605, + "learning_rate": 4.456e-06, + "loss": 0.0004, + "num_tokens": 41329820.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 236.09375, + "completions/mean_terminated_length": 236.09375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.20487354197944335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419921875, + "kl": 0.011343797406880185, + "learning_rate": 4.453999999999999e-06, + "loss": 0.0005, + "num_tokens": 41347263.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 213.34375, + "completions/mean_terminated_length": 213.34375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.20498902875620742, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.013903156141168438, + "learning_rate": 4.452e-06, + "loss": 0.0006, + "num_tokens": 41369866.0, + "reward": 3.374457836151123, + "reward_std": 0.5271715521812439, + "rewards/reward_fn/mean": 3.374457836151123, + "rewards/reward_fn/std": 0.5271715521812439, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 233.75, + "completions/mean_terminated_length": 233.75, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.2051045155329715, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.01101592607301427, + "learning_rate": 4.45e-06, + "loss": 0.0004, + "num_tokens": 41397282.0, + "reward": 3.3454723358154297, + "reward_std": 0.3590305745601654, + "rewards/reward_fn/mean": 3.3454723358154297, + "rewards/reward_fn/std": 0.359030544757843, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 60.0625, + "completions/mean_terminated_length": 60.0625, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.20522000230973553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.01537142887536902, + "learning_rate": 4.4480000000000004e-06, + "loss": 0.0006, + "num_tokens": 41423108.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 107.5625, + "completions/mean_terminated_length": 107.5625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.2053354890864996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.013462342190905474, + "learning_rate": 4.4459999999999994e-06, + "loss": 0.0005, + "num_tokens": 41443094.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 57.84375, + "completions/mean_terminated_length": 57.84375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.20545097586326366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16796875, + "kl": 0.013408283455646597, + "learning_rate": 4.444e-06, + "loss": 0.0005, + "num_tokens": 41465297.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 185.53125, + "completions/mean_terminated_length": 185.53125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.20556646264002773, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.013461425784043968, + "learning_rate": 4.442e-06, + "loss": 0.0005, + "num_tokens": 41484194.0, + "reward": 3.929906129837036, + "reward_std": 0.39651069045066833, + "rewards/reward_fn/mean": 3.929906129837036, + "rewards/reward_fn/std": 0.39651066064834595, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 202.84375, + "completions/mean_terminated_length": 202.84375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.20568194941679177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.212890625, + "kl": 0.02222769719082862, + "learning_rate": 4.44e-06, + "loss": 0.0009, + "num_tokens": 41508829.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 102.78125, + "completions/mean_terminated_length": 102.78125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.20579743619355584, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.008311425750434864, + "learning_rate": 4.438e-06, + "loss": 0.0003, + "num_tokens": 41537494.0, + "reward": 3.467515468597412, + "reward_std": 0.21439607441425323, + "rewards/reward_fn/mean": 3.467515468597412, + "rewards/reward_fn/std": 0.21439604461193085, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 95.84375, + "completions/mean_terminated_length": 95.84375, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.2059129229703199, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.006119564977780101, + "learning_rate": 4.4359999999999995e-06, + "loss": 0.0002, + "num_tokens": 41548721.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 138.53125, + "completions/mean_terminated_length": 138.53125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.20602840974708395, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.012711549177765846, + "learning_rate": 4.434e-06, + "loss": 0.0005, + "num_tokens": 41569954.0, + "reward": 3.723940134048462, + "reward_std": 0.41179975867271423, + "rewards/reward_fn/mean": 3.723940134048462, + "rewards/reward_fn/std": 0.41179972887039185, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 136.875, + "completions/mean_terminated_length": 136.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.206143896523848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.009174352046102285, + "learning_rate": 4.432e-06, + "loss": 0.0004, + "num_tokens": 41592350.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 91.625, + "completions/mean_terminated_length": 91.625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.20625938330061208, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19140625, + "kl": 0.016916113236220554, + "learning_rate": 4.43e-06, + "loss": 0.0007, + "num_tokens": 41604754.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 142.5625, + "completions/mean_terminated_length": 142.5625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.20637487007737615, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "kl": 0.017796236235881224, + "learning_rate": 4.428e-06, + "loss": 0.0007, + "num_tokens": 41625060.0, + "reward": 2.909127712249756, + "reward_std": 0.06329182535409927, + "rewards/reward_fn/mean": 2.909127712249756, + "rewards/reward_fn/std": 0.0632917657494545, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 90.9375, + "completions/mean_terminated_length": 90.9375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.2064903568541402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.009260798189643538, + "learning_rate": 4.426e-06, + "loss": 0.0004, + "num_tokens": 41649858.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 213.5, + "completions/mean_terminated_length": 213.5, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.20660584363090426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.012170430694823153, + "learning_rate": 4.424e-06, + "loss": 0.0005, + "num_tokens": 41668658.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 59.15625, + "completions/mean_terminated_length": 59.15625, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.20672133040766832, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3125, + "kl": 0.04604913157891133, + "learning_rate": 4.421999999999999e-06, + "loss": 0.0018, + "num_tokens": 41682679.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 247.65625, + "completions/mean_terminated_length": 247.65625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.2068368171844324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.014950676253647543, + "learning_rate": 4.42e-06, + "loss": 0.0006, + "num_tokens": 41702988.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 214.3125, + "completions/mean_terminated_length": 214.3125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.20695230396119643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.013813437530188821, + "learning_rate": 4.418e-06, + "loss": 0.0006, + "num_tokens": 41723734.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 106.84375, + "completions/mean_terminated_length": 106.84375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.2070677907379605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.011108596620033495, + "learning_rate": 4.416000000000001e-06, + "loss": 0.0004, + "num_tokens": 41742161.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 51.21875, + "completions/mean_terminated_length": 51.21875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.20718327751472457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1142578125, + "kl": 0.010230862069875002, + "learning_rate": 4.414e-06, + "loss": 0.0004, + "num_tokens": 41766328.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 343.03125, + "completions/mean_terminated_length": 343.03125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.20729876429148864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.011068196516134776, + "learning_rate": 4.4119999999999994e-06, + "loss": 0.0004, + "num_tokens": 41792281.0, + "reward": 3.929089069366455, + "reward_std": 0.4011319875717163, + "rewards/reward_fn/mean": 3.929089069366455, + "rewards/reward_fn/std": 0.4011319577693939, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 74.46875, + "completions/mean_terminated_length": 74.46875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.20741425106825268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.007876564039179357, + "learning_rate": 4.41e-06, + "loss": 0.0003, + "num_tokens": 41811656.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 285.78125, + "completions/mean_terminated_length": 228.9354705810547, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.20752973784501674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8359375, + "kl": 0.012035486070089974, + "learning_rate": 4.408e-06, + "loss": 0.0005, + "num_tokens": 41833217.0, + "reward": 3.8009133338928223, + "reward_std": 0.8102613091468811, + "rewards/reward_fn/mean": 3.8009133338928223, + "rewards/reward_fn/std": 0.8102613091468811, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 178.03125, + "completions/mean_terminated_length": 178.03125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.2076452246217808, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.01204193141165888, + "learning_rate": 4.406e-06, + "loss": 0.0005, + "num_tokens": 41857090.0, + "reward": 3.9308719635009766, + "reward_std": 0.21871207654476166, + "rewards/reward_fn/mean": 3.9308719635009766, + "rewards/reward_fn/std": 0.2187120020389557, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 217.65625, + "completions/mean_terminated_length": 217.65625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.20776071139854488, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.010685711036785506, + "learning_rate": 4.404e-06, + "loss": 0.0004, + "num_tokens": 41875095.0, + "reward": 2.843012809753418, + "reward_std": 0.07969164103269577, + "rewards/reward_fn/mean": 2.843012809753418, + "rewards/reward_fn/std": 0.07969164103269577, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 208.59375, + "completions/mean_terminated_length": 208.59375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.20787619817530892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.008422512597462628, + "learning_rate": 4.4019999999999995e-06, + "loss": 0.0003, + "num_tokens": 41905130.0, + "reward": 3.9308016300201416, + "reward_std": 0.3914453983306885, + "rewards/reward_fn/mean": 3.9308016300201416, + "rewards/reward_fn/std": 0.39144545793533325, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 282.5, + "completions/mean_terminated_length": 282.5, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.20799168495207299, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.019794436055235565, + "learning_rate": 4.4e-06, + "loss": 0.0008, + "num_tokens": 41937786.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 142.0, + "completions/mean_terminated_length": 142.0, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.20810717172883705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.014072890800889581, + "learning_rate": 4.397999999999999e-06, + "loss": 0.0006, + "num_tokens": 41955162.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 120.0, + "completions/mean_terminated_length": 120.0, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.20822265850560112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.019580853477236815, + "learning_rate": 4.396e-06, + "loss": 0.0008, + "num_tokens": 41970874.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 146.4375, + "completions/mean_terminated_length": 146.4375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.20833814528236516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.01413525988755282, + "learning_rate": 4.394e-06, + "loss": 0.0006, + "num_tokens": 41992392.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 127.09375, + "completions/mean_terminated_length": 127.09375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.20845363205912923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.011545601431862451, + "learning_rate": 4.3920000000000005e-06, + "loss": 0.0005, + "num_tokens": 42010699.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 83.03125, + "completions/mean_terminated_length": 83.03125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.2085691188358933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1123046875, + "kl": 0.011554346558114048, + "learning_rate": 4.3899999999999995e-06, + "loss": 0.0005, + "num_tokens": 42036908.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 97.15625, + "completions/mean_terminated_length": 97.15625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.20868460561265736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16015625, + "kl": 0.019892308599082753, + "learning_rate": 4.387999999999999e-06, + "loss": 0.0008, + "num_tokens": 42054321.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 61.53125, + "completions/mean_terminated_length": 61.53125, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.2088000923894214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.0054505149400938535, + "learning_rate": 4.386e-06, + "loss": 0.0002, + "num_tokens": 42079554.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 81.28125, + "completions/mean_terminated_length": 81.28125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.20891557916618547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.008150146073603537, + "learning_rate": 4.384e-06, + "loss": 0.0003, + "num_tokens": 42098059.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 117.6875, + "completions/mean_terminated_length": 117.6875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.20903106594294954, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.011428043377236463, + "learning_rate": 4.382e-06, + "loss": 0.0005, + "num_tokens": 42114017.0, + "reward": 3.9296560287475586, + "reward_std": 0.2768020033836365, + "rewards/reward_fn/mean": 3.9296560287475586, + "rewards/reward_fn/std": 0.2768020033836365, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 179.3125, + "completions/mean_terminated_length": 179.3125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.20914655271971358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.010110203264048323, + "learning_rate": 4.3799999999999996e-06, + "loss": 0.0004, + "num_tokens": 42131915.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 254.8125, + "completions/mean_terminated_length": 254.8125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.20926203949647765, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.010891400364926085, + "learning_rate": 4.378e-06, + "loss": 0.0004, + "num_tokens": 42162181.0, + "reward": 3.029404401779175, + "reward_std": 0.1831067055463791, + "rewards/reward_fn/mean": 3.029404401779175, + "rewards/reward_fn/std": 0.1831066757440567, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 325.03125, + "completions/mean_terminated_length": 325.03125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.20937752627324172, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.012834916022256948, + "learning_rate": 4.376e-06, + "loss": 0.0005, + "num_tokens": 42194054.0, + "reward": 3.756218910217285, + "reward_std": 0.5213128328323364, + "rewards/reward_fn/mean": 3.756218910217285, + "rewards/reward_fn/std": 0.5213128328323364, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 188.4375, + "completions/mean_terminated_length": 188.4375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.20949301305000578, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.010494040048797615, + "learning_rate": 4.373999999999999e-06, + "loss": 0.0004, + "num_tokens": 42211604.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 290.03125, + "completions/mean_terminated_length": 290.03125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.20960849982676982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042236328125, + "kl": 0.008550839440431446, + "learning_rate": 4.372e-06, + "loss": 0.0003, + "num_tokens": 42236565.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 277.59375, + "completions/mean_terminated_length": 277.59375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.2097239866035339, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.009385083052620757, + "learning_rate": 4.37e-06, + "loss": 0.0004, + "num_tokens": 42260616.0, + "reward": 3.9440691471099854, + "reward_std": 0.220203697681427, + "rewards/reward_fn/mean": 3.9440691471099854, + "rewards/reward_fn/std": 0.220203697681427, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 225.4375, + "completions/mean_terminated_length": 225.4375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.20983947338029796, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.007936494708701503, + "learning_rate": 4.368e-06, + "loss": 0.0003, + "num_tokens": 42287094.0, + "reward": 3.6972227096557617, + "reward_std": 0.42827144265174866, + "rewards/reward_fn/mean": 3.6972227096557617, + "rewards/reward_fn/std": 0.42827147245407104, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 99.40625, + "completions/mean_terminated_length": 99.40625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.20995496015706203, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.007128904544515535, + "learning_rate": 4.365999999999999e-06, + "loss": 0.0003, + "num_tokens": 42312355.0, + "reward": 3.5631375312805176, + "reward_std": 0.09435312449932098, + "rewards/reward_fn/mean": 3.5631375312805176, + "rewards/reward_fn/std": 0.09435313940048218, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 63.53125, + "completions/mean_terminated_length": 63.53125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.21007044693382607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.005719598306313856, + "learning_rate": 4.364e-06, + "loss": 0.0002, + "num_tokens": 42326292.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 213.78125, + "completions/mean_terminated_length": 213.78125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.21018593371059013, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.01308052139938809, + "learning_rate": 4.362e-06, + "loss": 0.0005, + "num_tokens": 42348461.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 203.34375, + "completions/mean_terminated_length": 203.34375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.2103014204873542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.01679814633098431, + "learning_rate": 4.36e-06, + "loss": 0.0007, + "num_tokens": 42374232.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 177.71875, + "completions/mean_terminated_length": 177.71875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.21041690726411827, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.01295778570784023, + "learning_rate": 4.358e-06, + "loss": 0.0005, + "num_tokens": 42399887.0, + "reward": 3.85764741897583, + "reward_std": 0.33977338671684265, + "rewards/reward_fn/mean": 3.85764741897583, + "rewards/reward_fn/std": 0.33977338671684265, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 63.84375, + "completions/mean_terminated_length": 63.84375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.2105323940408823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.177734375, + "kl": 0.01911259748158045, + "learning_rate": 4.3559999999999995e-06, + "loss": 0.0008, + "num_tokens": 42426250.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 190.375, + "completions/mean_terminated_length": 190.375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.21064788081764638, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.011466468757134862, + "learning_rate": 4.354e-06, + "loss": 0.0005, + "num_tokens": 42448982.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1667.0, + "completions/max_terminated_length": 1667.0, + "completions/mean_length": 295.125, + "completions/mean_terminated_length": 295.125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.21076336759441044, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.010060281900223345, + "learning_rate": 4.352e-06, + "loss": 0.0004, + "num_tokens": 42470906.0, + "reward": 3.9317660331726074, + "reward_std": 0.3859897553920746, + "rewards/reward_fn/mean": 3.9317660331726074, + "rewards/reward_fn/std": 0.3859897553920746, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 113.875, + "completions/mean_terminated_length": 113.875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.2108788543711745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.010258079797495157, + "learning_rate": 4.35e-06, + "loss": 0.0004, + "num_tokens": 42493366.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 246.0625, + "completions/mean_terminated_length": 246.0625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.21099434114793855, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.029709925525821745, + "learning_rate": 4.348e-06, + "loss": 0.0012, + "num_tokens": 42523864.0, + "reward": 3.2074975967407227, + "reward_std": 0.0940789133310318, + "rewards/reward_fn/mean": 3.2074975967407227, + "rewards/reward_fn/std": 0.0940789133310318, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 131.0625, + "completions/mean_terminated_length": 131.0625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.21110982792470262, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.046875, + "kl": 0.014393049801583402, + "learning_rate": 4.3459999999999996e-06, + "loss": 0.0006, + "num_tokens": 42554554.0, + "reward": 3.9713544845581055, + "reward_std": 0.16204403340816498, + "rewards/reward_fn/mean": 3.9713544845581055, + "rewards/reward_fn/std": 0.1620440036058426, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 118.9375, + "completions/mean_terminated_length": 118.9375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.2112253147014667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.014302221301477402, + "learning_rate": 4.344e-06, + "loss": 0.0006, + "num_tokens": 42576472.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 143.34375, + "completions/mean_terminated_length": 143.34375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.21134080147823076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.008870953366567846, + "learning_rate": 4.341999999999999e-06, + "loss": 0.0004, + "num_tokens": 42605475.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 95.40625, + "completions/mean_terminated_length": 95.40625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.2114562882549948, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.01426490461017238, + "learning_rate": 4.34e-06, + "loss": 0.0006, + "num_tokens": 42616208.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 171.0625, + "completions/mean_terminated_length": 171.0625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.21157177503175886, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.01827588031301275, + "learning_rate": 4.338e-06, + "loss": 0.0007, + "num_tokens": 42644306.0, + "reward": 3.9671664237976074, + "reward_std": 0.18573513627052307, + "rewards/reward_fn/mean": 3.9671664237976074, + "rewards/reward_fn/std": 0.18573518097400665, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 189.9375, + "completions/mean_terminated_length": 189.9375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.21168726180852293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.013601631901110522, + "learning_rate": 4.3360000000000005e-06, + "loss": 0.0005, + "num_tokens": 42662896.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 290.78125, + "completions/mean_terminated_length": 290.78125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.211802748585287, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.01150635106023401, + "learning_rate": 4.3339999999999995e-06, + "loss": 0.0005, + "num_tokens": 42689321.0, + "reward": 3.1169025897979736, + "reward_std": 0.5055363178253174, + "rewards/reward_fn/mean": 3.1169025897979736, + "rewards/reward_fn/std": 0.5055363178253174, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 189.6875, + "completions/mean_terminated_length": 189.6875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.21191823536205104, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "kl": 0.012628969881916419, + "learning_rate": 4.331999999999999e-06, + "loss": 0.0005, + "num_tokens": 42707423.0, + "reward": 3.163019895553589, + "reward_std": 0.573836624622345, + "rewards/reward_fn/mean": 3.163019895553589, + "rewards/reward_fn/std": 0.5738366842269897, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 151.28125, + "completions/mean_terminated_length": 151.28125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.2120337221388151, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.015106910024769604, + "learning_rate": 4.33e-06, + "loss": 0.0006, + "num_tokens": 42731464.0, + "reward": 3.9705567359924316, + "reward_std": 0.16655686497688293, + "rewards/reward_fn/mean": 3.9705567359924316, + "rewards/reward_fn/std": 0.16655683517456055, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 105.09375, + "completions/mean_terminated_length": 105.09375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.21214920891557917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.011819686587841716, + "learning_rate": 4.328e-06, + "loss": 0.0005, + "num_tokens": 42749387.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 188.9375, + "completions/mean_terminated_length": 188.9375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.2122646956923432, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.01380706237978302, + "learning_rate": 4.326e-06, + "loss": 0.0006, + "num_tokens": 42779305.0, + "reward": 3.979644536972046, + "reward_std": 0.11514782160520554, + "rewards/reward_fn/mean": 3.979644536972046, + "rewards/reward_fn/std": 0.11514779925346375, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 138.84375, + "completions/mean_terminated_length": 138.84375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.21238018246910728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.014671814526082017, + "learning_rate": 4.324e-06, + "loss": 0.0006, + "num_tokens": 42795972.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 153.90625, + "completions/mean_terminated_length": 153.90625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.21249566924587135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.012045157462125644, + "learning_rate": 4.3219999999999994e-06, + "loss": 0.0005, + "num_tokens": 42823553.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 187.53125, + "completions/mean_terminated_length": 187.53125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.21261115602263542, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.01632811574381776, + "learning_rate": 4.32e-06, + "loss": 0.0007, + "num_tokens": 42852434.0, + "reward": 3.5986552238464355, + "reward_std": 0.8488426804542542, + "rewards/reward_fn/mean": 3.5986552238464355, + "rewards/reward_fn/std": 0.8488426804542542, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 86.96875, + "completions/mean_terminated_length": 86.96875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.21272664279939946, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.010907702671829611, + "learning_rate": 4.317999999999999e-06, + "loss": 0.0004, + "num_tokens": 42884433.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 309.3125, + "completions/mean_terminated_length": 253.22579956054688, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.21284212957616352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.765625, + "kl": 0.018401009583612904, + "learning_rate": 4.316e-06, + "loss": 0.0007, + "num_tokens": 42912891.0, + "reward": 2.9655332565307617, + "reward_std": 0.5544604659080505, + "rewards/reward_fn/mean": 2.9655332565307617, + "rewards/reward_fn/std": 0.5544604063034058, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 60.71875, + "completions/mean_terminated_length": 60.71875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.2129576163529276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.008524849739842466, + "learning_rate": 4.314e-06, + "loss": 0.0003, + "num_tokens": 42924498.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 121.875, + "completions/mean_terminated_length": 121.875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.21307310312969166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.00910432620003121, + "learning_rate": 4.312e-06, + "loss": 0.0004, + "num_tokens": 42939118.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 93.59375, + "completions/mean_terminated_length": 93.59375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.2131885899064557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043212890625, + "kl": 0.003916982799637481, + "learning_rate": 4.309999999999999e-06, + "loss": 0.0002, + "num_tokens": 42952193.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 421.84375, + "completions/mean_terminated_length": 421.84375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.21330407668321977, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.011833253171062097, + "learning_rate": 4.307999999999999e-06, + "loss": 0.0005, + "num_tokens": 42980860.0, + "reward": 3.609062671661377, + "reward_std": 0.8492540717124939, + "rewards/reward_fn/mean": 3.609062671661377, + "rewards/reward_fn/std": 0.8492540717124939, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 175.34375, + "completions/mean_terminated_length": 175.34375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.21341956345998384, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.019787766301305965, + "learning_rate": 4.306e-06, + "loss": 0.0008, + "num_tokens": 43008999.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 158.0, + "completions/mean_terminated_length": 158.0, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.2135350502367479, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.01014959454187192, + "learning_rate": 4.304e-06, + "loss": 0.0004, + "num_tokens": 43037095.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 56.375, + "completions/mean_terminated_length": 56.375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.21365053701351194, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.8125, + "kl": 0.007390781905996846, + "learning_rate": 4.302e-06, + "loss": 0.0003, + "num_tokens": 43063315.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.213766023790276, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.006682103579805698, + "learning_rate": 4.2999999999999995e-06, + "loss": 0.0003, + "num_tokens": 43092348.0, + "reward": 3.9293646812438965, + "reward_std": 0.39957377314567566, + "rewards/reward_fn/mean": 3.9293646812438965, + "rewards/reward_fn/std": 0.39957377314567566, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 158.9375, + "completions/mean_terminated_length": 158.9375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.21388151056704008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.010143837163923308, + "learning_rate": 4.298e-06, + "loss": 0.0004, + "num_tokens": 43112058.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 70.75, + "completions/mean_terminated_length": 70.75, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.21399699734380415, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.008720550245925551, + "learning_rate": 4.296e-06, + "loss": 0.0003, + "num_tokens": 43131794.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 46.75, + "completions/mean_terminated_length": 46.75, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.2141124841205682, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.012841294032114092, + "learning_rate": 4.293999999999999e-06, + "loss": 0.0005, + "num_tokens": 43148938.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 119.96875, + "completions/mean_terminated_length": 119.96875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.21422797089733225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.01119628781452775, + "learning_rate": 4.292e-06, + "loss": 0.0004, + "num_tokens": 43164905.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 171.65625, + "completions/mean_terminated_length": 171.65625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.21434345767409632, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056396484375, + "kl": 0.008337539300555363, + "learning_rate": 4.29e-06, + "loss": 0.0003, + "num_tokens": 43193438.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 204.5, + "completions/mean_terminated_length": 204.5, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.2144589444508604, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.009882940663374029, + "learning_rate": 4.288e-06, + "loss": 0.0004, + "num_tokens": 43219310.0, + "reward": 3.9460501670837402, + "reward_std": 0.21266350150108337, + "rewards/reward_fn/mean": 3.9460501670837402, + "rewards/reward_fn/std": 0.21266351640224457, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 97.34375, + "completions/mean_terminated_length": 97.34375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.21457443122762443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.01288459260831587, + "learning_rate": 4.285999999999999e-06, + "loss": 0.0005, + "num_tokens": 43235097.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 130.90625, + "completions/mean_terminated_length": 130.90625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.2146899180043885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.013714176559005864, + "learning_rate": 4.284e-06, + "loss": 0.0005, + "num_tokens": 43257398.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 316.5625, + "completions/mean_terminated_length": 316.5625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.21480540478115256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052001953125, + "kl": 0.009952753476682119, + "learning_rate": 4.282e-06, + "loss": 0.0004, + "num_tokens": 43292520.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 145.90625, + "completions/mean_terminated_length": 145.90625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.21492089155791663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.01056372079619905, + "learning_rate": 4.28e-06, + "loss": 0.0004, + "num_tokens": 43315205.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 144.09375, + "completions/mean_terminated_length": 144.09375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.21503637833468067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.009353386660222895, + "learning_rate": 4.2779999999999995e-06, + "loss": 0.0004, + "num_tokens": 43335048.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 215.75, + "completions/mean_terminated_length": 215.75, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.21515186511144474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.017351950809825212, + "learning_rate": 4.275999999999999e-06, + "loss": 0.0007, + "num_tokens": 43362912.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 123.65625, + "completions/mean_terminated_length": 123.65625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.2152673518882088, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.008594665268901736, + "learning_rate": 4.274e-06, + "loss": 0.0003, + "num_tokens": 43378773.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 94.375, + "completions/mean_terminated_length": 94.375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.21538283866497285, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.006934229066246189, + "learning_rate": 4.272e-06, + "loss": 0.0003, + "num_tokens": 43403041.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1039.0, + "completions/mean_length": 453.5, + "completions/mean_terminated_length": 402.06451416015625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.21549832544173692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.97265625, + "kl": 0.010925251117441803, + "learning_rate": 4.27e-06, + "loss": 0.0004, + "num_tokens": 43434449.0, + "reward": 3.7635626792907715, + "reward_std": 0.8228526711463928, + "rewards/reward_fn/mean": 3.7635626792907715, + "rewards/reward_fn/std": 0.8228526711463928, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 309.9375, + "completions/mean_terminated_length": 253.87095642089844, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.21561381221850098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.56640625, + "kl": 0.012905614283226896, + "learning_rate": 4.268e-06, + "loss": 0.0005, + "num_tokens": 43456687.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 109.65625, + "completions/mean_terminated_length": 109.65625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.21572929899526505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.008687979330716189, + "learning_rate": 4.2659999999999995e-06, + "loss": 0.0003, + "num_tokens": 43475972.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 188.53125, + "completions/mean_terminated_length": 188.53125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.2158447857720291, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.012729593814583495, + "learning_rate": 4.264e-06, + "loss": 0.0005, + "num_tokens": 43498869.0, + "reward": 3.4955945014953613, + "reward_std": 0.6502060890197754, + "rewards/reward_fn/mean": 3.4955945014953613, + "rewards/reward_fn/std": 0.6502061486244202, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 69.6875, + "completions/mean_terminated_length": 69.6875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.21596027254879316, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2216796875, + "kl": 0.015325111868150998, + "learning_rate": 4.261999999999999e-06, + "loss": 0.0006, + "num_tokens": 43521835.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 75.875, + "completions/mean_terminated_length": 75.875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.21607575932555723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.005982143020446529, + "learning_rate": 4.26e-06, + "loss": 0.0002, + "num_tokens": 43550247.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 279.21875, + "completions/mean_terminated_length": 279.21875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.2161912461023213, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.01476986640773248, + "learning_rate": 4.258e-06, + "loss": 0.0006, + "num_tokens": 43575662.0, + "reward": 3.2002599239349365, + "reward_std": 0.6037581562995911, + "rewards/reward_fn/mean": 3.2002599239349365, + "rewards/reward_fn/std": 0.6037582159042358, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 59.75, + "completions/mean_terminated_length": 59.75, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.21630673287908533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.007573175444122171, + "learning_rate": 4.256e-06, + "loss": 0.0003, + "num_tokens": 43604870.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 80.78125, + "completions/mean_terminated_length": 80.78125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.2164222196558494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.005041345462814206, + "learning_rate": 4.253999999999999e-06, + "loss": 0.0002, + "num_tokens": 43630559.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 64.8125, + "completions/mean_terminated_length": 64.8125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.21653770643261347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.008895011467757286, + "learning_rate": 4.251999999999999e-06, + "loss": 0.0004, + "num_tokens": 43657913.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 224.84375, + "completions/mean_terminated_length": 224.84375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.21665319320937754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054443359375, + "kl": 0.010337062733015046, + "learning_rate": 4.25e-06, + "loss": 0.0004, + "num_tokens": 43682644.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 171.21875, + "completions/mean_terminated_length": 171.21875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.21676867998614158, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.011276393066509627, + "learning_rate": 4.248e-06, + "loss": 0.0005, + "num_tokens": 43705115.0, + "reward": 3.930624485015869, + "reward_std": 0.39244699478149414, + "rewards/reward_fn/mean": 3.930624485015869, + "rewards/reward_fn/std": 0.39244696497917175, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 197.0, + "completions/mean_terminated_length": 197.0, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.21688416676290564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.015604651300236583, + "learning_rate": 4.2460000000000005e-06, + "loss": 0.0006, + "num_tokens": 43736795.0, + "reward": 3.958207607269287, + "reward_std": 0.13322047889232635, + "rewards/reward_fn/mean": 3.958207607269287, + "rewards/reward_fn/std": 0.13322046399116516, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 64.0625, + "completions/mean_terminated_length": 64.0625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.2169996535396697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.008347001305082813, + "learning_rate": 4.2439999999999995e-06, + "loss": 0.0003, + "num_tokens": 43750557.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 174.09375, + "completions/mean_terminated_length": 174.09375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.21711514031643378, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.011852546958834864, + "learning_rate": 4.242e-06, + "loss": 0.0005, + "num_tokens": 43778400.0, + "reward": 3.0563790798187256, + "reward_std": 0.05725564807653427, + "rewards/reward_fn/mean": 3.0563790798187256, + "rewards/reward_fn/std": 0.05725563317537308, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.21723062709319782, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.00932472497515846, + "learning_rate": 4.24e-06, + "loss": 0.0004, + "num_tokens": 43800388.0, + "reward": 3.9290523529052734, + "reward_std": 0.27918949723243713, + "rewards/reward_fn/mean": 3.9290523529052734, + "rewards/reward_fn/std": 0.27918946743011475, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 100.875, + "completions/mean_terminated_length": 100.875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.2173461138699619, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.005017005516492645, + "learning_rate": 4.238e-06, + "loss": 0.0002, + "num_tokens": 43824064.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 246.46875, + "completions/mean_terminated_length": 246.46875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.21746160064672596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.009618787087674718, + "learning_rate": 4.236e-06, + "loss": 0.0004, + "num_tokens": 43847183.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 259.4375, + "completions/mean_terminated_length": 259.4375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.21757708742349002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.009763890935573727, + "learning_rate": 4.234e-06, + "loss": 0.0004, + "num_tokens": 43885149.0, + "reward": 2.4710745811462402, + "reward_std": 0.4260454773902893, + "rewards/reward_fn/mean": 2.4710745811462402, + "rewards/reward_fn/std": 0.4260455071926117, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 108.21875, + "completions/mean_terminated_length": 108.21875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.21769257420025406, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.010788177336507943, + "learning_rate": 4.232e-06, + "loss": 0.0004, + "num_tokens": 43907652.0, + "reward": 3.2983016967773438, + "reward_std": 0.0731206089258194, + "rewards/reward_fn/mean": 3.2983016967773438, + "rewards/reward_fn/std": 0.07312063127756119, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 136.0, + "completions/mean_terminated_length": 136.0, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.21780806097701813, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.026370148145360872, + "learning_rate": 4.23e-06, + "loss": 0.0011, + "num_tokens": 43935460.0, + "reward": 3.2962985038757324, + "reward_std": 0.24316607415676117, + "rewards/reward_fn/mean": 3.2962985038757324, + "rewards/reward_fn/std": 0.24316613376140594, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 174.46875, + "completions/mean_terminated_length": 174.46875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.2179235477537822, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.008933976445405278, + "learning_rate": 4.227999999999999e-06, + "loss": 0.0004, + "num_tokens": 43960595.0, + "reward": 3.9486947059631348, + "reward_std": 0.20632795989513397, + "rewards/reward_fn/mean": 3.9486947059631348, + "rewards/reward_fn/std": 0.20632793009281158, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 67.53125, + "completions/mean_terminated_length": 67.53125, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.21803903453054627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15625, + "kl": 0.010799019117257558, + "learning_rate": 4.226e-06, + "loss": 0.0004, + "num_tokens": 43974564.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 164.0, + "completions/mean_terminated_length": 164.0, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.2181545213073103, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.018183566309744492, + "learning_rate": 4.224e-06, + "loss": 0.0007, + "num_tokens": 43998756.0, + "reward": 2.921217918395996, + "reward_std": 0.05472329258918762, + "rewards/reward_fn/mean": 2.921217918395996, + "rewards/reward_fn/std": 0.054723307490348816, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 211.75, + "completions/mean_terminated_length": 211.75, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.21827000808407437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045166015625, + "kl": 0.009329752778285183, + "learning_rate": 4.222e-06, + "loss": 0.0004, + "num_tokens": 44018076.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 275.34375, + "completions/mean_terminated_length": 275.34375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.21838549486083844, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.008234267472289503, + "learning_rate": 4.219999999999999e-06, + "loss": 0.0003, + "num_tokens": 44042023.0, + "reward": 3.9271492958068848, + "reward_std": 0.4121064245700836, + "rewards/reward_fn/mean": 3.9271492958068848, + "rewards/reward_fn/std": 0.41210636496543884, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 195.03125, + "completions/mean_terminated_length": 195.03125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.21850098163760248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.011811924530775286, + "learning_rate": 4.218e-06, + "loss": 0.0005, + "num_tokens": 44070024.0, + "reward": 3.8891348838806152, + "reward_std": 0.2352144569158554, + "rewards/reward_fn/mean": 3.8891348838806152, + "rewards/reward_fn/std": 0.2352144718170166, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 278.40625, + "completions/mean_terminated_length": 278.40625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.21861646841436655, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.009248448579455726, + "learning_rate": 4.216e-06, + "loss": 0.0004, + "num_tokens": 44102421.0, + "reward": 3.6784303188323975, + "reward_std": 0.7844281792640686, + "rewards/reward_fn/mean": 3.6784303188323975, + "rewards/reward_fn/std": 0.7844281792640686, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 386.46875, + "completions/mean_terminated_length": 386.46875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.21873195519113062, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.008383072257856838, + "learning_rate": 4.214e-06, + "loss": 0.0003, + "num_tokens": 44124644.0, + "reward": 3.856736421585083, + "reward_std": 0.5637442469596863, + "rewards/reward_fn/mean": 3.856736421585083, + "rewards/reward_fn/std": 0.5637442469596863, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 301.28125, + "completions/mean_terminated_length": 301.28125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.21884744196789468, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.009564951920765452, + "learning_rate": 4.212e-06, + "loss": 0.0004, + "num_tokens": 44142285.0, + "reward": 3.2951531410217285, + "reward_std": 0.7395024299621582, + "rewards/reward_fn/mean": 3.2951531410217285, + "rewards/reward_fn/std": 0.7395023703575134, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 357.78125, + "completions/mean_terminated_length": 357.78125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.21896292874465872, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150390625, + "kl": 0.009823907603276893, + "learning_rate": 4.2099999999999995e-06, + "loss": 0.0004, + "num_tokens": 44177766.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 154.375, + "completions/mean_terminated_length": 154.375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.2190784155214228, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045166015625, + "kl": 0.007010689223534428, + "learning_rate": 4.208e-06, + "loss": 0.0003, + "num_tokens": 44193234.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 193.4375, + "completions/mean_terminated_length": 193.4375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.21919390229818686, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.01590860923170112, + "learning_rate": 4.206e-06, + "loss": 0.0006, + "num_tokens": 44217760.0, + "reward": 3.800774574279785, + "reward_std": 0.38633644580841064, + "rewards/reward_fn/mean": 3.800774574279785, + "rewards/reward_fn/std": 0.38633644580841064, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 223.1875, + "completions/mean_terminated_length": 223.1875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.21930938907495093, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.008133652121614432, + "learning_rate": 4.204e-06, + "loss": 0.0003, + "num_tokens": 44248006.0, + "reward": 3.8128373622894287, + "reward_std": 0.609617292881012, + "rewards/reward_fn/mean": 3.8128373622894287, + "rewards/reward_fn/std": 0.6096172332763672, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 151.71875, + "completions/mean_terminated_length": 151.71875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.21942487585171497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.011044255370507017, + "learning_rate": 4.202e-06, + "loss": 0.0004, + "num_tokens": 44264221.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 87.9375, + "completions/mean_terminated_length": 87.9375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.21954036262847904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.011166050579049625, + "learning_rate": 4.2e-06, + "loss": 0.0004, + "num_tokens": 44284731.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 84.78125, + "completions/mean_terminated_length": 84.78125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.2196558494052431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19140625, + "kl": 0.023304144240682945, + "learning_rate": 4.198e-06, + "loss": 0.0009, + "num_tokens": 44303988.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 122.78125, + "completions/mean_terminated_length": 122.78125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.21977133618200717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.013566521563916467, + "learning_rate": 4.195999999999999e-06, + "loss": 0.0005, + "num_tokens": 44320173.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 100.25, + "completions/mean_terminated_length": 100.25, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.2198868229587712, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.0076249354824540205, + "learning_rate": 4.194e-06, + "loss": 0.0003, + "num_tokens": 44343061.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 112.8125, + "completions/mean_terminated_length": 112.8125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.22000230973553528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.015780807225382887, + "learning_rate": 4.192e-06, + "loss": 0.0006, + "num_tokens": 44369967.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 70.03125, + "completions/mean_terminated_length": 70.03125, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.22011779651229935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05859375, + "kl": 0.005781595093139913, + "learning_rate": 4.1900000000000005e-06, + "loss": 0.0002, + "num_tokens": 44386896.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 173.5, + "completions/mean_terminated_length": 173.5, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.22023328328906341, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.025377428508363664, + "learning_rate": 4.1879999999999995e-06, + "loss": 0.001, + "num_tokens": 44410496.0, + "reward": 3.647341012954712, + "reward_std": 0.3949136435985565, + "rewards/reward_fn/mean": 3.647341012954712, + "rewards/reward_fn/std": 0.3949136435985565, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 75.78125, + "completions/mean_terminated_length": 75.78125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.22034877006582745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.007215381843707291, + "learning_rate": 4.185999999999999e-06, + "loss": 0.0003, + "num_tokens": 44442425.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 279.78125, + "completions/mean_terminated_length": 279.78125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.22046425684259152, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.009284126412239857, + "learning_rate": 4.184e-06, + "loss": 0.0004, + "num_tokens": 44464690.0, + "reward": 3.891789436340332, + "reward_std": 0.44981521368026733, + "rewards/reward_fn/mean": 3.891789436340332, + "rewards/reward_fn/std": 0.44981521368026733, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 237.34375, + "completions/mean_terminated_length": 237.34375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.2205797436193556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05126953125, + "kl": 0.011898356518941, + "learning_rate": 4.182e-06, + "loss": 0.0005, + "num_tokens": 44486525.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 260.375, + "completions/mean_terminated_length": 260.375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.22069523039611966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039306640625, + "kl": 0.006667347755865194, + "learning_rate": 4.18e-06, + "loss": 0.0003, + "num_tokens": 44510025.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 141.46875, + "completions/mean_terminated_length": 141.46875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.2208107171728837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.006467427196184872, + "learning_rate": 4.178e-06, + "loss": 0.0003, + "num_tokens": 44536248.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 97.3125, + "completions/mean_terminated_length": 97.3125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.22092620394964776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.00863553579984, + "learning_rate": 4.176e-06, + "loss": 0.0003, + "num_tokens": 44558498.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 277.59375, + "completions/mean_terminated_length": 277.59375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.22104169072641183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.014649118165834807, + "learning_rate": 4.174e-06, + "loss": 0.0006, + "num_tokens": 44583285.0, + "reward": 3.9327030181884766, + "reward_std": 0.38068887591362, + "rewards/reward_fn/mean": 3.9327030181884766, + "rewards/reward_fn/std": 0.38068887591362, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 86.03125, + "completions/mean_terminated_length": 86.03125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.2211571775031759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.006000510380545165, + "learning_rate": 4.171999999999999e-06, + "loss": 0.0002, + "num_tokens": 44609366.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 212.46875, + "completions/mean_terminated_length": 212.46875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.22127266427993994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.013879378660931252, + "learning_rate": 4.17e-06, + "loss": 0.0006, + "num_tokens": 44638181.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 119.0, + "completions/mean_terminated_length": 119.0, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.221388151056704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.008410844013269525, + "learning_rate": 4.168e-06, + "loss": 0.0003, + "num_tokens": 44665733.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 203.09375, + "completions/mean_terminated_length": 203.09375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.22150363783346808, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "kl": 0.00980346749565797, + "learning_rate": 4.1660000000000004e-06, + "loss": 0.0004, + "num_tokens": 44691048.0, + "reward": 3.9041500091552734, + "reward_std": 0.4222012758255005, + "rewards/reward_fn/mean": 3.9041500091552734, + "rewards/reward_fn/std": 0.4222012460231781, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 195.03125, + "completions/mean_terminated_length": 195.03125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.22161912461023212, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.009139036032138392, + "learning_rate": 4.1639999999999994e-06, + "loss": 0.0004, + "num_tokens": 44709897.0, + "reward": 3.662044048309326, + "reward_std": 0.3451992869377136, + "rewards/reward_fn/mean": 3.662044048309326, + "rewards/reward_fn/std": 0.34519925713539124, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 94.84375, + "completions/mean_terminated_length": 94.84375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.22173461138699618, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.007361563177255448, + "learning_rate": 4.162e-06, + "loss": 0.0003, + "num_tokens": 44736100.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 183.375, + "completions/mean_terminated_length": 183.375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.22185009816376025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.007981653427123092, + "learning_rate": 4.16e-06, + "loss": 0.0003, + "num_tokens": 44752880.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 78.6875, + "completions/mean_terminated_length": 78.6875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.22196558494052432, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.009656504975282587, + "learning_rate": 4.158e-06, + "loss": 0.0004, + "num_tokens": 44771174.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 166.90625, + "completions/mean_terminated_length": 166.90625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.22208107171728836, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.015025472443085164, + "learning_rate": 4.156e-06, + "loss": 0.0006, + "num_tokens": 44787491.0, + "reward": 3.196183681488037, + "reward_std": 0.0348983071744442, + "rewards/reward_fn/mean": 3.196183681488037, + "rewards/reward_fn/std": 0.034898318350315094, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 136.9375, + "completions/mean_terminated_length": 136.9375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.22219655849405243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.008416337521339301, + "learning_rate": 4.1539999999999995e-06, + "loss": 0.0003, + "num_tokens": 44809601.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 96.21875, + "completions/mean_terminated_length": 96.21875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.2223120452708165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.009806832997128367, + "learning_rate": 4.152e-06, + "loss": 0.0004, + "num_tokens": 44831336.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 148.0, + "completions/mean_terminated_length": 148.0, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.22242753204758056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.011412996056606062, + "learning_rate": 4.15e-06, + "loss": 0.0005, + "num_tokens": 44858920.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 297.40625, + "completions/mean_terminated_length": 297.40625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.2225430188243446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.012076094455551356, + "learning_rate": 4.147999999999999e-06, + "loss": 0.0005, + "num_tokens": 44884053.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 136.09375, + "completions/mean_terminated_length": 136.09375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.22265850560110867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.008169147611624794, + "learning_rate": 4.146e-06, + "loss": 0.0003, + "num_tokens": 44920824.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 360.75, + "completions/mean_terminated_length": 360.75, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.22277399237787274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046142578125, + "kl": 0.009722488845000044, + "learning_rate": 4.144e-06, + "loss": 0.0004, + "num_tokens": 44947920.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 152.1875, + "completions/mean_terminated_length": 152.1875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.2228894791546368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.009877211785351392, + "learning_rate": 4.142e-06, + "loss": 0.0004, + "num_tokens": 44964406.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 131.09375, + "completions/mean_terminated_length": 131.09375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.22300496593140084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.00993640223168768, + "learning_rate": 4.139999999999999e-06, + "loss": 0.0004, + "num_tokens": 44980921.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 148.03125, + "completions/mean_terminated_length": 148.03125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.2231204527081649, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.015921497673843987, + "learning_rate": 4.138e-06, + "loss": 0.0006, + "num_tokens": 45008538.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 102.4375, + "completions/mean_terminated_length": 102.4375, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.22323593948492898, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.008571565354941413, + "learning_rate": 4.136e-06, + "loss": 0.0003, + "num_tokens": 45023688.0, + "reward": 3.9620330333709717, + "reward_std": 0.2147737443447113, + "rewards/reward_fn/mean": 3.9620330333709717, + "rewards/reward_fn/std": 0.2147737592458725, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 118.625, + "completions/mean_terminated_length": 118.625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.22335142626169305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.011363780402461998, + "learning_rate": 4.1340000000000006e-06, + "loss": 0.0005, + "num_tokens": 45042108.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 175.65625, + "completions/mean_terminated_length": 175.65625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.2234669130384571, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.012330671728705056, + "learning_rate": 4.1319999999999996e-06, + "loss": 0.0005, + "num_tokens": 45060369.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 108.5, + "completions/mean_terminated_length": 108.5, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.22358239981522116, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.037629615304467734, + "learning_rate": 4.129999999999999e-06, + "loss": 0.0015, + "num_tokens": 45077569.0, + "reward": 3.0130577087402344, + "reward_std": 0.047634419053792953, + "rewards/reward_fn/mean": 3.0130577087402344, + "rewards/reward_fn/std": 0.04763442650437355, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 211.1875, + "completions/mean_terminated_length": 211.1875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.22369788659198522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.010167489614104852, + "learning_rate": 4.128e-06, + "loss": 0.0004, + "num_tokens": 45096231.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 353.09375, + "completions/mean_terminated_length": 353.09375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.2238133733687493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.007986412805621512, + "learning_rate": 4.126e-06, + "loss": 0.0003, + "num_tokens": 45122634.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 100.3125, + "completions/mean_terminated_length": 100.3125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.22392886014551333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062255859375, + "kl": 0.007885939718107693, + "learning_rate": 4.124e-06, + "loss": 0.0003, + "num_tokens": 45137908.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 200.625, + "completions/mean_terminated_length": 200.625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.2240443469222774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.012187518019345589, + "learning_rate": 4.122e-06, + "loss": 0.0005, + "num_tokens": 45159656.0, + "reward": 3.8897085189819336, + "reward_std": 0.2993803918361664, + "rewards/reward_fn/mean": 3.8897085189819336, + "rewards/reward_fn/std": 0.2993803918361664, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 159.09375, + "completions/mean_terminated_length": 159.09375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.22415983369904147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.010135256015928462, + "learning_rate": 4.1199999999999995e-06, + "loss": 0.0004, + "num_tokens": 45180971.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 307.5625, + "completions/mean_terminated_length": 307.5625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.22427532047580553, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.011847384186694399, + "learning_rate": 4.118e-06, + "loss": 0.0005, + "num_tokens": 45210237.0, + "reward": 2.9242167472839355, + "reward_std": 0.20478574931621552, + "rewards/reward_fn/mean": 2.9242167472839355, + "rewards/reward_fn/std": 0.2047857940196991, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 70.375, + "completions/mean_terminated_length": 70.375, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.22439080725256957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0439453125, + "kl": 0.0040936519944807515, + "learning_rate": 4.115999999999999e-06, + "loss": 0.0002, + "num_tokens": 45227401.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 84.21875, + "completions/mean_terminated_length": 84.21875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.22450629402933364, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.171875, + "kl": 0.009261766430427087, + "learning_rate": 4.114e-06, + "loss": 0.0004, + "num_tokens": 45242128.0, + "reward": 2.7941248416900635, + "reward_std": 0.03137638792395592, + "rewards/reward_fn/mean": 2.7941248416900635, + "rewards/reward_fn/std": 0.03137640655040741, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 382.1875, + "completions/mean_terminated_length": 382.1875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.2246217808060977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03955078125, + "kl": 0.009795742640562821, + "learning_rate": 4.112e-06, + "loss": 0.0004, + "num_tokens": 45264790.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 64.0, + "completions/max_terminated_length": 64.0, + "completions/mean_length": 44.03125, + "completions/mean_terminated_length": 44.03125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.22473726758286175, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.96875, + "kl": 0.02927931703743525, + "learning_rate": 4.1100000000000005e-06, + "loss": 0.0012, + "num_tokens": 45292695.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 117.90625, + "completions/mean_terminated_length": 117.90625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.22485275435962582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.0179244413448032, + "learning_rate": 4.1079999999999995e-06, + "loss": 0.0007, + "num_tokens": 45315636.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 165.25, + "completions/mean_terminated_length": 165.25, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.22496824113638988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.00855086141382344, + "learning_rate": 4.105999999999999e-06, + "loss": 0.0003, + "num_tokens": 45340092.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 431.65625, + "completions/mean_terminated_length": 379.51611328125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.22508372791315395, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6484375, + "kl": 0.0067388145325821824, + "learning_rate": 4.104e-06, + "loss": 0.0003, + "num_tokens": 45369169.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 76.09375, + "completions/mean_terminated_length": 76.09375, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.225199214689918, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.006148141688754549, + "learning_rate": 4.102e-06, + "loss": 0.0002, + "num_tokens": 45388660.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 101.125, + "completions/mean_terminated_length": 101.125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.22531470146668206, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.004246693487402808, + "learning_rate": 4.1e-06, + "loss": 0.0002, + "num_tokens": 45415640.0, + "reward": 3.6428513526916504, + "reward_std": 0.05395352095365524, + "rewards/reward_fn/mean": 3.6428513526916504, + "rewards/reward_fn/std": 0.05395353212952614, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 87.59375, + "completions/mean_terminated_length": 87.59375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.22543018824344613, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.013357613155676518, + "learning_rate": 4.0979999999999996e-06, + "loss": 0.0005, + "num_tokens": 45439787.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 220.03125, + "completions/mean_terminated_length": 220.03125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.2255456750202102, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.012615345083759166, + "learning_rate": 4.096e-06, + "loss": 0.0005, + "num_tokens": 45465612.0, + "reward": 3.5186266899108887, + "reward_std": 0.600145161151886, + "rewards/reward_fn/mean": 3.5186266899108887, + "rewards/reward_fn/std": 0.600145161151886, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 93.75, + "completions/mean_terminated_length": 93.75, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.22566116179697424, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.625, + "kl": 0.008121311762806727, + "learning_rate": 4.094e-06, + "loss": 0.0003, + "num_tokens": 45488772.0, + "reward": 3.6624910831451416, + "reward_std": 0.3893602192401886, + "rewards/reward_fn/mean": 3.6624910831451416, + "rewards/reward_fn/std": 0.3893602192401886, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 112.15625, + "completions/mean_terminated_length": 112.15625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.2257766485737383, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0625, + "kl": 0.021627139853080735, + "learning_rate": 4.091999999999999e-06, + "loss": 0.0009, + "num_tokens": 45517033.0, + "reward": 3.977522850036621, + "reward_std": 0.12715056538581848, + "rewards/reward_fn/mean": 3.977522850036621, + "rewards/reward_fn/std": 0.1271505504846573, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 63.375, + "completions/mean_terminated_length": 63.375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.22589213535050237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.010554315769695677, + "learning_rate": 4.09e-06, + "loss": 0.0004, + "num_tokens": 45532213.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 111.90625, + "completions/mean_terminated_length": 111.90625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.22600762212726644, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.796875, + "kl": 0.016239067263086326, + "learning_rate": 4.088e-06, + "loss": 0.0006, + "num_tokens": 45551474.0, + "reward": 3.3905839920043945, + "reward_std": 0.5823779106140137, + "rewards/reward_fn/mean": 3.3905839920043945, + "rewards/reward_fn/std": 0.5823779106140137, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 167.78125, + "completions/mean_terminated_length": 167.78125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.22612310890403048, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.011172865575645119, + "learning_rate": 4.086e-06, + "loss": 0.0004, + "num_tokens": 45571819.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 162.96875, + "completions/mean_terminated_length": 162.96875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.22623859568079455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.008045314702030737, + "learning_rate": 4.083999999999999e-06, + "loss": 0.0003, + "num_tokens": 45595210.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 139.59375, + "completions/mean_terminated_length": 139.59375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.22635408245755861, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.012511431392340455, + "learning_rate": 4.082e-06, + "loss": 0.0005, + "num_tokens": 45617917.0, + "reward": 3.9751501083374023, + "reward_std": 0.14057141542434692, + "rewards/reward_fn/mean": 3.9751501083374023, + "rewards/reward_fn/std": 0.14057140052318573, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 328.5, + "completions/mean_terminated_length": 328.5, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.22646956923432268, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.010587060547550209, + "learning_rate": 4.08e-06, + "loss": 0.0004, + "num_tokens": 45651629.0, + "reward": 3.282343864440918, + "reward_std": 0.4249555170536041, + "rewards/reward_fn/mean": 3.282343864440918, + "rewards/reward_fn/std": 0.4249555170536041, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 162.9375, + "completions/mean_terminated_length": 162.9375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.22658505601108672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.008242130934377201, + "learning_rate": 4.078e-06, + "loss": 0.0003, + "num_tokens": 45672395.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 229.4375, + "completions/mean_terminated_length": 229.4375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.2267005427878508, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.012459871592000127, + "learning_rate": 4.076e-06, + "loss": 0.0005, + "num_tokens": 45691673.0, + "reward": 3.976745367050171, + "reward_std": 0.13154824078083038, + "rewards/reward_fn/mean": 3.976745367050171, + "rewards/reward_fn/std": 0.13154828548431396, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 159.03125, + "completions/mean_terminated_length": 159.03125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.22681602956461486, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.025624415779020637, + "learning_rate": 4.0739999999999994e-06, + "loss": 0.001, + "num_tokens": 45707994.0, + "reward": 3.0140910148620605, + "reward_std": 0.12873949110507965, + "rewards/reward_fn/mean": 3.0140910148620605, + "rewards/reward_fn/std": 0.12873950600624084, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 170.84375, + "completions/mean_terminated_length": 170.84375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.22693151634137892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.011063902085879818, + "learning_rate": 4.072e-06, + "loss": 0.0004, + "num_tokens": 45724725.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 90.03125, + "completions/mean_terminated_length": 90.03125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.22704700311814296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.009139846843027044, + "learning_rate": 4.07e-06, + "loss": 0.0004, + "num_tokens": 45755542.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 165.28125, + "completions/mean_terminated_length": 165.28125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.22716248989490703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.008737404859857634, + "learning_rate": 4.068e-06, + "loss": 0.0003, + "num_tokens": 45777183.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 68.53125, + "completions/mean_terminated_length": 68.53125, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.2272779766716711, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1728515625, + "kl": 0.01315758800774347, + "learning_rate": 4.066e-06, + "loss": 0.0005, + "num_tokens": 45799536.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 263.09375, + "completions/mean_terminated_length": 263.09375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.22739346344843517, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.012532126442238223, + "learning_rate": 4.0639999999999995e-06, + "loss": 0.0005, + "num_tokens": 45819187.0, + "reward": 3.9681310653686523, + "reward_std": 0.18027806282043457, + "rewards/reward_fn/mean": 3.9681310653686523, + "rewards/reward_fn/std": 0.18027807772159576, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 82.96875, + "completions/mean_terminated_length": 82.96875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.2275089502251992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.011403897835407406, + "learning_rate": 4.062e-06, + "loss": 0.0005, + "num_tokens": 45831570.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 80.65625, + "completions/mean_terminated_length": 80.65625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.22762443700196328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.007376368692348478, + "learning_rate": 4.059999999999999e-06, + "loss": 0.0003, + "num_tokens": 45851687.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 326.53125, + "completions/mean_terminated_length": 326.53125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.22773992377872734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.011080789583502337, + "learning_rate": 4.058e-06, + "loss": 0.0004, + "num_tokens": 45885784.0, + "reward": 3.2294187545776367, + "reward_std": 1.033329963684082, + "rewards/reward_fn/mean": 3.2294187545776367, + "rewards/reward_fn/std": 1.033329963684082, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 80.96875, + "completions/mean_terminated_length": 80.96875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.22785541055549138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.010518776343815261, + "learning_rate": 4.056e-06, + "loss": 0.0004, + "num_tokens": 45899959.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 120.53125, + "completions/mean_terminated_length": 120.53125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.22797089733225545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.012355384023976512, + "learning_rate": 4.0540000000000005e-06, + "loss": 0.0005, + "num_tokens": 45915080.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 159.9375, + "completions/mean_terminated_length": 159.9375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.22808638410901952, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053955078125, + "kl": 0.0092496968281921, + "learning_rate": 4.0519999999999995e-06, + "loss": 0.0004, + "num_tokens": 45936934.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 70.5625, + "completions/mean_terminated_length": 70.5625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.2282018708857836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.00837068405780883, + "learning_rate": 4.049999999999999e-06, + "loss": 0.0003, + "num_tokens": 45950616.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 62.34375, + "completions/mean_terminated_length": 62.34375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.22831735766254763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1767578125, + "kl": 0.01782311608985765, + "learning_rate": 4.048e-06, + "loss": 0.0007, + "num_tokens": 45980707.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 83.5625, + "completions/mean_terminated_length": 83.5625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.2284328444393117, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.008566279389924603, + "learning_rate": 4.046e-06, + "loss": 0.0003, + "num_tokens": 46007349.0, + "reward": 3.9750747680664062, + "reward_std": 0.14099900424480438, + "rewards/reward_fn/mean": 3.9750747680664062, + "rewards/reward_fn/std": 0.1409989893436432, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 148.625, + "completions/mean_terminated_length": 148.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.22854833121607576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.01109502138569951, + "learning_rate": 4.044e-06, + "loss": 0.0004, + "num_tokens": 46031209.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 268.3125, + "completions/mean_terminated_length": 268.3125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.22866381799283983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0380859375, + "kl": 0.007800741892424412, + "learning_rate": 4.042e-06, + "loss": 0.0003, + "num_tokens": 46051507.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 427.75, + "completions/mean_terminated_length": 375.4838562011719, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.22877930476960387, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.01540960546117276, + "learning_rate": 4.0399999999999994e-06, + "loss": 0.0006, + "num_tokens": 46083307.0, + "reward": 3.3210482597351074, + "reward_std": 0.8531471490859985, + "rewards/reward_fn/mean": 3.3210482597351074, + "rewards/reward_fn/std": 0.8531472682952881, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 139.34375, + "completions/mean_terminated_length": 139.34375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.22889479154636794, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.012714505690382794, + "learning_rate": 4.038e-06, + "loss": 0.0005, + "num_tokens": 46106102.0, + "reward": 3.810666799545288, + "reward_std": 0.5689865946769714, + "rewards/reward_fn/mean": 3.810666799545288, + "rewards/reward_fn/std": 0.5689865946769714, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 200.375, + "completions/mean_terminated_length": 200.375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.229010278323132, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.012061250934493728, + "learning_rate": 4.035999999999999e-06, + "loss": 0.0005, + "num_tokens": 46128514.0, + "reward": 2.954342842102051, + "reward_std": 0.2039838284254074, + "rewards/reward_fn/mean": 2.954342842102051, + "rewards/reward_fn/std": 0.20398381352424622, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 308.28125, + "completions/mean_terminated_length": 308.28125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.22912576509989607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.010371312746428885, + "learning_rate": 4.034e-06, + "loss": 0.0004, + "num_tokens": 46147979.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 467.0, + "completions/mean_terminated_length": 467.0, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.2292412518766601, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.01583337367628701, + "learning_rate": 4.032e-06, + "loss": 0.0006, + "num_tokens": 46184715.0, + "reward": 3.144080400466919, + "reward_std": 0.8845758438110352, + "rewards/reward_fn/mean": 3.144080400466919, + "rewards/reward_fn/std": 0.8845758438110352, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 127.4375, + "completions/mean_terminated_length": 127.4375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.22935673865342418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.011616733754635789, + "learning_rate": 4.03e-06, + "loss": 0.0005, + "num_tokens": 46208633.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.22947222543018825, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.009760898683452979, + "learning_rate": 4.027999999999999e-06, + "loss": 0.0004, + "num_tokens": 46225545.0, + "reward": 3.79679012298584, + "reward_std": 0.3934653103351593, + "rewards/reward_fn/mean": 3.79679012298584, + "rewards/reward_fn/std": 0.3934653103351593, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 251.3125, + "completions/mean_terminated_length": 251.3125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.22958771220695232, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.010445824504131451, + "learning_rate": 4.025999999999999e-06, + "loss": 0.0004, + "num_tokens": 46259539.0, + "reward": 3.927212953567505, + "reward_std": 0.4117460548877716, + "rewards/reward_fn/mean": 3.927212953567505, + "rewards/reward_fn/std": 0.41174599528312683, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 280.1875, + "completions/mean_terminated_length": 280.1875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.22970319898371636, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.012799466130672954, + "learning_rate": 4.024e-06, + "loss": 0.0005, + "num_tokens": 46289465.0, + "reward": 2.8441989421844482, + "reward_std": 0.49156275391578674, + "rewards/reward_fn/mean": 2.8441989421844482, + "rewards/reward_fn/std": 0.49156272411346436, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 159.6875, + "completions/mean_terminated_length": 159.6875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.22981868576048042, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.014068584569031373, + "learning_rate": 4.022e-06, + "loss": 0.0006, + "num_tokens": 46310927.0, + "reward": 3.299485921859741, + "reward_std": 0.4733685851097107, + "rewards/reward_fn/mean": 3.299485921859741, + "rewards/reward_fn/std": 0.4733685851097107, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 159.46875, + "completions/mean_terminated_length": 159.46875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.2299341725372445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.009657942566263955, + "learning_rate": 4.02e-06, + "loss": 0.0004, + "num_tokens": 46329246.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 180.03125, + "completions/mean_terminated_length": 180.03125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.23004965931400856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.012257806418347172, + "learning_rate": 4.0179999999999995e-06, + "loss": 0.0005, + "num_tokens": 46346367.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 174.59375, + "completions/mean_terminated_length": 174.59375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.2301651460907726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.01119643194397213, + "learning_rate": 4.016e-06, + "loss": 0.0004, + "num_tokens": 46374578.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 106.21875, + "completions/mean_terminated_length": 106.21875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.23028063286753667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.005566171450482216, + "learning_rate": 4.014e-06, + "loss": 0.0002, + "num_tokens": 46403001.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 201.84375, + "completions/mean_terminated_length": 201.84375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.23039611964430073, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.007572495022031944, + "learning_rate": 4.011999999999999e-06, + "loss": 0.0003, + "num_tokens": 46426932.0, + "reward": 3.981736183166504, + "reward_std": 0.10331536084413528, + "rewards/reward_fn/mean": 3.981736183166504, + "rewards/reward_fn/std": 0.1033153235912323, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 123.84375, + "completions/mean_terminated_length": 123.84375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.2305116064210648, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.010952012278721668, + "learning_rate": 4.01e-06, + "loss": 0.0004, + "num_tokens": 46450703.0, + "reward": 3.8931572437286377, + "reward_std": 0.22687768936157227, + "rewards/reward_fn/mean": 3.8931572437286377, + "rewards/reward_fn/std": 0.22687770426273346, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 269.46875, + "completions/mean_terminated_length": 269.46875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.23062709319782884, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.01117726627853699, + "learning_rate": 4.0079999999999996e-06, + "loss": 0.0004, + "num_tokens": 46470494.0, + "reward": 3.9273414611816406, + "reward_std": 0.4110182225704193, + "rewards/reward_fn/mean": 3.9273414611816406, + "rewards/reward_fn/std": 0.4110182523727417, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 61.875, + "completions/mean_terminated_length": 61.875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.2307425799745929, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.007914301633718424, + "learning_rate": 4.006e-06, + "loss": 0.0003, + "num_tokens": 46488218.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 405.28125, + "completions/mean_terminated_length": 405.28125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.23085806675135698, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.010987270041368902, + "learning_rate": 4.003999999999999e-06, + "loss": 0.0004, + "num_tokens": 46528803.0, + "reward": 3.4853451251983643, + "reward_std": 0.6319980621337891, + "rewards/reward_fn/mean": 3.4853451251983643, + "rewards/reward_fn/std": 0.6319980621337891, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 158.65625, + "completions/mean_terminated_length": 158.65625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.23097355352812102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.01360286351700779, + "learning_rate": 4.002e-06, + "loss": 0.0005, + "num_tokens": 46554232.0, + "reward": 3.954071283340454, + "reward_std": 0.18090969324111938, + "rewards/reward_fn/mean": 3.954071283340454, + "rewards/reward_fn/std": 0.1809096783399582, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 151.84375, + "completions/mean_terminated_length": 151.84375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.23108904030488508, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.014150506176520139, + "learning_rate": 4e-06, + "loss": 0.0006, + "num_tokens": 46581203.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 63.28125, + "completions/mean_terminated_length": 63.28125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.23120452708164915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.008384020260564284, + "learning_rate": 3.998e-06, + "loss": 0.0003, + "num_tokens": 46591932.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 308.3125, + "completions/mean_terminated_length": 308.3125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.23132001385841322, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.009692385079688393, + "learning_rate": 3.9959999999999995e-06, + "loss": 0.0004, + "num_tokens": 46611494.0, + "reward": 3.930946111679077, + "reward_std": 0.3906276226043701, + "rewards/reward_fn/mean": 3.930946111679077, + "rewards/reward_fn/std": 0.39062759280204773, + "step": 2003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 80.84375, + "completions/mean_terminated_length": 80.84375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.23143550063517726, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.484375, + "kl": 0.01439122674491955, + "learning_rate": 3.994e-06, + "loss": 0.0006, + "num_tokens": 46627713.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 2004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 249.5625, + "completions/mean_terminated_length": 249.5625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.23155098741194133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04345703125, + "kl": 0.008404701358813327, + "learning_rate": 3.992e-06, + "loss": 0.0003, + "num_tokens": 46645363.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 139.84375, + "completions/mean_terminated_length": 139.84375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.2316664741887054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.008119915633869823, + "learning_rate": 3.99e-06, + "loss": 0.0003, + "num_tokens": 46666126.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 128.9375, + "completions/mean_terminated_length": 128.9375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.23178196096546946, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.010902547495788895, + "learning_rate": 3.988e-06, + "loss": 0.0004, + "num_tokens": 46682924.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 190.59375, + "completions/mean_terminated_length": 190.59375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.2318974477422335, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.009653917571995407, + "learning_rate": 3.986e-06, + "loss": 0.0004, + "num_tokens": 46711295.0, + "reward": 3.9434592723846436, + "reward_std": 0.22283923625946045, + "rewards/reward_fn/mean": 3.9434592723846436, + "rewards/reward_fn/std": 0.22283923625946045, + "step": 2008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 239.1875, + "completions/mean_terminated_length": 239.1875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.23201293451899757, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.011065623737522401, + "learning_rate": 3.9839999999999995e-06, + "loss": 0.0004, + "num_tokens": 46737925.0, + "reward": 2.8211002349853516, + "reward_std": 0.22026962041854858, + "rewards/reward_fn/mean": 2.8211002349853516, + "rewards/reward_fn/std": 0.220269575715065, + "step": 2009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 73.59375, + "completions/mean_terminated_length": 73.59375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.23212842129576164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.006965254062379245, + "learning_rate": 3.982e-06, + "loss": 0.0003, + "num_tokens": 46761368.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 73.65625, + "completions/mean_terminated_length": 73.65625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.2322439080725257, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.004495779523495003, + "learning_rate": 3.98e-06, + "loss": 0.0002, + "num_tokens": 46791885.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 332.375, + "completions/mean_terminated_length": 332.375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.23235939484928975, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.009410411264980212, + "learning_rate": 3.978e-06, + "loss": 0.0004, + "num_tokens": 46817657.0, + "reward": 3.929215908050537, + "reward_std": 0.4004152715206146, + "rewards/reward_fn/mean": 3.929215908050537, + "rewards/reward_fn/std": 0.40041524171829224, + "step": 2012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 107.0, + "completions/mean_terminated_length": 107.0, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.23247488162605381, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.009147419390501454, + "learning_rate": 3.976e-06, + "loss": 0.0004, + "num_tokens": 46829049.0, + "reward": 3.9788925647735596, + "reward_std": 0.11940151453018188, + "rewards/reward_fn/mean": 3.9788925647735596, + "rewards/reward_fn/std": 0.1194014698266983, + "step": 2013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 172.53125, + "completions/mean_terminated_length": 172.53125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.23259036840281788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.010127753143024165, + "learning_rate": 3.974e-06, + "loss": 0.0004, + "num_tokens": 46845066.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 245.5625, + "completions/mean_terminated_length": 245.5625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.23270585517958195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.011450589590822347, + "learning_rate": 3.971999999999999e-06, + "loss": 0.0005, + "num_tokens": 46865468.0, + "reward": 3.7904157638549805, + "reward_std": 0.6620621085166931, + "rewards/reward_fn/mean": 3.7904157638549805, + "rewards/reward_fn/std": 0.6620621085166931, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 162.28125, + "completions/mean_terminated_length": 162.28125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.232821341956346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.011488426956930198, + "learning_rate": 3.97e-06, + "loss": 0.0005, + "num_tokens": 46888741.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 224.28125, + "completions/mean_terminated_length": 224.28125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.23293682873311006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.013895016978494823, + "learning_rate": 3.968e-06, + "loss": 0.0006, + "num_tokens": 46913742.0, + "reward": 3.6354568004608154, + "reward_std": 0.8356758952140808, + "rewards/reward_fn/mean": 3.6354568004608154, + "rewards/reward_fn/std": 0.835675835609436, + "step": 2017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 118.28125, + "completions/mean_terminated_length": 118.28125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.23305231550987413, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.012675916717853397, + "learning_rate": 3.966e-06, + "loss": 0.0005, + "num_tokens": 46936439.0, + "reward": 3.9722187519073486, + "reward_std": 0.15715454518795013, + "rewards/reward_fn/mean": 3.9722187519073486, + "rewards/reward_fn/std": 0.15715454518795013, + "step": 2018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 76.5625, + "completions/mean_terminated_length": 76.5625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.2331678022866382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1806640625, + "kl": 0.013167593206162564, + "learning_rate": 3.964e-06, + "loss": 0.0005, + "num_tokens": 46962089.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 354.09375, + "completions/mean_terminated_length": 354.09375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.23328328906340223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.011586226581130177, + "learning_rate": 3.962e-06, + "loss": 0.0005, + "num_tokens": 46989644.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1090.0, + "completions/max_terminated_length": 1090.0, + "completions/mean_length": 442.125, + "completions/mean_terminated_length": 442.125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.2333987758401663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034912109375, + "kl": 0.007961359311593696, + "learning_rate": 3.96e-06, + "loss": 0.0003, + "num_tokens": 47013552.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 211.1875, + "completions/mean_terminated_length": 211.1875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.23351426261693037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.009272643088479526, + "learning_rate": 3.958e-06, + "loss": 0.0004, + "num_tokens": 47039126.0, + "reward": 3.698751926422119, + "reward_std": 0.5883362293243408, + "rewards/reward_fn/mean": 3.698751926422119, + "rewards/reward_fn/std": 0.5883362293243408, + "step": 2022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 112.8125, + "completions/mean_terminated_length": 112.8125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.23362974939369444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.011870950314914808, + "learning_rate": 3.956e-06, + "loss": 0.0005, + "num_tokens": 47067344.0, + "reward": 3.934123992919922, + "reward_std": 0.37265172600746155, + "rewards/reward_fn/mean": 3.934123992919922, + "rewards/reward_fn/std": 0.37265172600746155, + "step": 2023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 268.78125, + "completions/mean_terminated_length": 268.78125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.23374523617045848, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.010962482156173792, + "learning_rate": 3.954e-06, + "loss": 0.0004, + "num_tokens": 47087881.0, + "reward": 3.8716180324554443, + "reward_std": 0.45202842354774475, + "rewards/reward_fn/mean": 3.8716180324554443, + "rewards/reward_fn/std": 0.4520283639431, + "step": 2024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 114.0625, + "completions/mean_terminated_length": 114.0625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.23386072294722254, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.015984026540536433, + "learning_rate": 3.952e-06, + "loss": 0.0006, + "num_tokens": 47099659.0, + "reward": 3.566680431365967, + "reward_std": 0.3215753734111786, + "rewards/reward_fn/mean": 3.566680431365967, + "rewards/reward_fn/std": 0.3215753734111786, + "step": 2025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 73.3125, + "completions/mean_terminated_length": 73.3125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.2339762097239866, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.01272988291020738, + "learning_rate": 3.95e-06, + "loss": 0.0005, + "num_tokens": 47123797.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 152.1875, + "completions/mean_terminated_length": 152.1875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.23409169650075065, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.011974501176155172, + "learning_rate": 3.948e-06, + "loss": 0.0005, + "num_tokens": 47141275.0, + "reward": 3.9654884338378906, + "reward_std": 0.19522765278816223, + "rewards/reward_fn/mean": 3.9654884338378906, + "rewards/reward_fn/std": 0.19522765278816223, + "step": 2027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 218.21875, + "completions/mean_terminated_length": 218.21875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.23420718327751472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.013354567301576026, + "learning_rate": 3.946e-06, + "loss": 0.0005, + "num_tokens": 47160738.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 205.5, + "completions/mean_terminated_length": 205.5, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.2343226700542788, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.011123651296657044, + "learning_rate": 3.944e-06, + "loss": 0.0004, + "num_tokens": 47182930.0, + "reward": 3.9321529865264893, + "reward_std": 0.3838009536266327, + "rewards/reward_fn/mean": 3.9321529865264893, + "rewards/reward_fn/std": 0.3838009536266327, + "step": 2029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 410.03125, + "completions/mean_terminated_length": 410.03125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.23443815683104285, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.008098128193523735, + "learning_rate": 3.942e-06, + "loss": 0.0003, + "num_tokens": 47205907.0, + "reward": 3.92915940284729, + "reward_std": 0.40073463320732117, + "rewards/reward_fn/mean": 3.92915940284729, + "rewards/reward_fn/std": 0.40073463320732117, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 176.0, + "completions/mean_terminated_length": 176.0, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.2345536436078069, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.01027455521398224, + "learning_rate": 3.9399999999999995e-06, + "loss": 0.0004, + "num_tokens": 47230547.0, + "reward": 3.902977705001831, + "reward_std": 0.413899302482605, + "rewards/reward_fn/mean": 3.902977705001831, + "rewards/reward_fn/std": 0.4138992428779602, + "step": 2031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 69.40625, + "completions/mean_terminated_length": 69.40625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.23466913038457096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.008912480720027816, + "learning_rate": 3.938e-06, + "loss": 0.0004, + "num_tokens": 47260064.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 105.59375, + "completions/mean_terminated_length": 105.59375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.23478461716133503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.008567257529648487, + "learning_rate": 3.936e-06, + "loss": 0.0003, + "num_tokens": 47277171.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 219.21875, + "completions/mean_terminated_length": 219.21875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.2349001039380991, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.01568928021879401, + "learning_rate": 3.934e-06, + "loss": 0.0006, + "num_tokens": 47309914.0, + "reward": 3.9253134727478027, + "reward_std": 0.24275310337543488, + "rewards/reward_fn/mean": 3.9253134727478027, + "rewards/reward_fn/std": 0.24275313317775726, + "step": 2034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 192.65625, + "completions/mean_terminated_length": 192.65625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.23501559071486314, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.01960209006210789, + "learning_rate": 3.932e-06, + "loss": 0.0008, + "num_tokens": 47335855.0, + "reward": 3.7528858184814453, + "reward_std": 0.4023014307022095, + "rewards/reward_fn/mean": 3.7528858184814453, + "rewards/reward_fn/std": 0.4023014008998871, + "step": 2035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 395.96875, + "completions/mean_terminated_length": 395.96875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.2351310774916272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03564453125, + "kl": 0.00812143633083906, + "learning_rate": 3.93e-06, + "loss": 0.0003, + "num_tokens": 47358382.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 258.4375, + "completions/mean_terminated_length": 258.4375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.23524656426839127, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.015801118221133947, + "learning_rate": 3.9279999999999995e-06, + "loss": 0.0006, + "num_tokens": 47376316.0, + "reward": 2.8044238090515137, + "reward_std": 0.3347412645816803, + "rewards/reward_fn/mean": 2.8044238090515137, + "rewards/reward_fn/std": 0.3347412049770355, + "step": 2037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 248.03125, + "completions/mean_terminated_length": 248.03125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.23536205104515534, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.01108857081271708, + "learning_rate": 3.926e-06, + "loss": 0.0004, + "num_tokens": 47405725.0, + "reward": 3.857527256011963, + "reward_std": 0.33711904287338257, + "rewards/reward_fn/mean": 3.857527256011963, + "rewards/reward_fn/std": 0.3371190130710602, + "step": 2038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 154.4375, + "completions/mean_terminated_length": 154.4375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.23547753782191938, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.009025197417940944, + "learning_rate": 3.924e-06, + "loss": 0.0004, + "num_tokens": 47422539.0, + "reward": 3.985321044921875, + "reward_std": 0.08303705602884293, + "rewards/reward_fn/mean": 3.985321044921875, + "rewards/reward_fn/std": 0.08303708583116531, + "step": 2039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 122.84375, + "completions/mean_terminated_length": 122.84375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.23559302459868345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04541015625, + "kl": 0.004616817117494065, + "learning_rate": 3.922e-06, + "loss": 0.0002, + "num_tokens": 47450054.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 175.34375, + "completions/mean_terminated_length": 175.34375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.23570851137544752, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.010158749166294001, + "learning_rate": 3.92e-06, + "loss": 0.0004, + "num_tokens": 47479825.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 115.28125, + "completions/mean_terminated_length": 115.28125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.23582399815221158, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.012353405079920776, + "learning_rate": 3.918e-06, + "loss": 0.0005, + "num_tokens": 47497562.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 103.0, + "completions/max_terminated_length": 103.0, + "completions/mean_length": 73.4375, + "completions/mean_terminated_length": 73.4375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.23593948492897562, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051513671875, + "kl": 0.003569877128029475, + "learning_rate": 3.9159999999999994e-06, + "loss": 0.0001, + "num_tokens": 47519048.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 104.0, + "completions/max_terminated_length": 104.0, + "completions/mean_length": 56.71875, + "completions/mean_terminated_length": 56.71875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.2360549717057397, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1728515625, + "kl": 0.011935444788832683, + "learning_rate": 3.914e-06, + "loss": 0.0005, + "num_tokens": 47544063.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 59.5625, + "completions/mean_terminated_length": 59.5625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.23617045848250376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.255859375, + "kl": 0.020035176137753297, + "learning_rate": 3.912e-06, + "loss": 0.0008, + "num_tokens": 47567697.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 121.53125, + "completions/mean_terminated_length": 121.53125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.23628594525926783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.01077533364878036, + "learning_rate": 3.91e-06, + "loss": 0.0004, + "num_tokens": 47592834.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 133.9375, + "completions/mean_terminated_length": 133.9375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.23640143203603187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.014243768528103828, + "learning_rate": 3.908e-06, + "loss": 0.0006, + "num_tokens": 47610624.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 134.875, + "completions/mean_terminated_length": 134.875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.23651691881279593, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.01119245718291495, + "learning_rate": 3.906e-06, + "loss": 0.0004, + "num_tokens": 47625788.0, + "reward": 3.6014204025268555, + "reward_std": 0.8113340139389038, + "rewards/reward_fn/mean": 3.6014204025268555, + "rewards/reward_fn/std": 0.811333954334259, + "step": 2048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 90.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 60.53125, + "completions/mean_terminated_length": 60.53125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.23663240558956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.012300556350965053, + "learning_rate": 3.903999999999999e-06, + "loss": 0.0005, + "num_tokens": 47640909.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 90.09375, + "completions/mean_terminated_length": 90.09375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.23674789236632407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.007183422761954716, + "learning_rate": 3.902e-06, + "loss": 0.0003, + "num_tokens": 47655984.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 290.03125, + "completions/mean_terminated_length": 290.03125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.2368633791430881, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.010818405062309466, + "learning_rate": 3.9e-06, + "loss": 0.0004, + "num_tokens": 47680081.0, + "reward": 3.714641571044922, + "reward_std": 0.7672418355941772, + "rewards/reward_fn/mean": 3.714641571044922, + "rewards/reward_fn/std": 0.7672418355941772, + "step": 2051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 353.75, + "completions/mean_terminated_length": 353.75, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.23697886591985218, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.010759265293017961, + "learning_rate": 3.898e-06, + "loss": 0.0004, + "num_tokens": 47710825.0, + "reward": 2.798365831375122, + "reward_std": 0.24101287126541138, + "rewards/reward_fn/mean": 2.798365831375122, + "rewards/reward_fn/std": 0.24101288616657257, + "step": 2052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 106.15625, + "completions/mean_terminated_length": 106.15625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.23709435269661625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.010017317239544354, + "learning_rate": 3.896e-06, + "loss": 0.0004, + "num_tokens": 47736046.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 110.3125, + "completions/mean_terminated_length": 110.3125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.23720983947338029, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.009924680278345477, + "learning_rate": 3.894e-06, + "loss": 0.0004, + "num_tokens": 47761816.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 151.78125, + "completions/mean_terminated_length": 151.78125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.23732532625014435, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.018682407215237617, + "learning_rate": 3.891999999999999e-06, + "loss": 0.0007, + "num_tokens": 47780529.0, + "reward": 3.9484634399414062, + "reward_std": 0.2028004229068756, + "rewards/reward_fn/mean": 3.9484634399414062, + "rewards/reward_fn/std": 0.2028004229068756, + "step": 2055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 202.75, + "completions/mean_terminated_length": 202.75, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.23744081302690842, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.012354275182588026, + "learning_rate": 3.89e-06, + "loss": 0.0005, + "num_tokens": 47807305.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 216.09375, + "completions/mean_terminated_length": 216.09375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.2375562998036725, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.012197255142382346, + "learning_rate": 3.888e-06, + "loss": 0.0005, + "num_tokens": 47827788.0, + "reward": 3.7202208042144775, + "reward_std": 0.4585617184638977, + "rewards/reward_fn/mean": 3.7202208042144775, + "rewards/reward_fn/std": 0.4585616886615753, + "step": 2057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 68.625, + "completions/mean_terminated_length": 68.625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.23767178658043653, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.006703798680973705, + "learning_rate": 3.886e-06, + "loss": 0.0003, + "num_tokens": 47845888.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 95.53125, + "completions/mean_terminated_length": 95.53125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.2377872733572006, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.0176811905548675, + "learning_rate": 3.8839999999999996e-06, + "loss": 0.0007, + "num_tokens": 47867921.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1096.0, + "completions/max_terminated_length": 1096.0, + "completions/mean_length": 301.28125, + "completions/mean_terminated_length": 301.28125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.23790276013396466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.010049667689600028, + "learning_rate": 3.882e-06, + "loss": 0.0004, + "num_tokens": 47905690.0, + "reward": 3.6863646507263184, + "reward_std": 0.6993573307991028, + "rewards/reward_fn/mean": 3.6863646507263184, + "rewards/reward_fn/std": 0.6993573307991028, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 162.71875, + "completions/mean_terminated_length": 162.71875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.23801824691072873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.009360619238577783, + "learning_rate": 3.88e-06, + "loss": 0.0004, + "num_tokens": 47934033.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 126.21875, + "completions/mean_terminated_length": 126.21875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.23813373368749277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062255859375, + "kl": 0.007336169881455135, + "learning_rate": 3.878e-06, + "loss": 0.0003, + "num_tokens": 47946200.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 106.78125, + "completions/mean_terminated_length": 106.78125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.23824922046425684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050048828125, + "kl": 0.00566859881291748, + "learning_rate": 3.876e-06, + "loss": 0.0002, + "num_tokens": 47975089.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 164.0, + "completions/mean_terminated_length": 164.0, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.2383647072410209, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051513671875, + "kl": 0.007750730132102035, + "learning_rate": 3.874e-06, + "loss": 0.0003, + "num_tokens": 48004401.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 153.03125, + "completions/mean_terminated_length": 153.03125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.23848019401778497, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.01146718705422245, + "learning_rate": 3.8719999999999995e-06, + "loss": 0.0005, + "num_tokens": 48033554.0, + "reward": 3.9280810356140137, + "reward_std": 0.4068344533443451, + "rewards/reward_fn/mean": 3.9280810356140137, + "rewards/reward_fn/std": 0.4068344533443451, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 87.8125, + "completions/mean_terminated_length": 87.8125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.23859568079454901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.008472191235341597, + "learning_rate": 3.87e-06, + "loss": 0.0003, + "num_tokens": 48052108.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 57.34375, + "completions/mean_terminated_length": 57.34375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.23871116757131308, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2236328125, + "kl": 0.014600794536818285, + "learning_rate": 3.868e-06, + "loss": 0.0006, + "num_tokens": 48076311.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 153.3125, + "completions/mean_terminated_length": 153.3125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.23882665434807715, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.014714031596668065, + "learning_rate": 3.866e-06, + "loss": 0.0006, + "num_tokens": 48094273.0, + "reward": 3.879469394683838, + "reward_std": 0.3241654932498932, + "rewards/reward_fn/mean": 3.879469394683838, + "rewards/reward_fn/std": 0.3241654634475708, + "step": 2068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 253.25, + "completions/mean_terminated_length": 253.25, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.23894214112484122, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.015097276860615239, + "learning_rate": 3.864e-06, + "loss": 0.0006, + "num_tokens": 48117417.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 242.8125, + "completions/mean_terminated_length": 242.8125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.23905762790160526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150390625, + "kl": 0.007669833859836217, + "learning_rate": 3.862e-06, + "loss": 0.0003, + "num_tokens": 48141411.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 221.3125, + "completions/mean_terminated_length": 221.3125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.23917311467836933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.007771076285280287, + "learning_rate": 3.8599999999999995e-06, + "loss": 0.0003, + "num_tokens": 48171309.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 155.1875, + "completions/mean_terminated_length": 155.1875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.2392886014551334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.008907798845029902, + "learning_rate": 3.858e-06, + "loss": 0.0004, + "num_tokens": 48188275.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 135.34375, + "completions/mean_terminated_length": 135.34375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.23940408823189746, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.007750740274786949, + "learning_rate": 3.856e-06, + "loss": 0.0003, + "num_tokens": 48210302.0, + "reward": 2.955704689025879, + "reward_std": 0.3428187668323517, + "rewards/reward_fn/mean": 2.955704689025879, + "rewards/reward_fn/std": 0.34281882643699646, + "step": 2073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 224.65625, + "completions/mean_terminated_length": 224.65625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.2395195750086615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05029296875, + "kl": 0.011443427647463977, + "learning_rate": 3.854e-06, + "loss": 0.0005, + "num_tokens": 48228275.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1118.0, + "completions/max_terminated_length": 1118.0, + "completions/mean_length": 357.28125, + "completions/mean_terminated_length": 357.28125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.23963506178542557, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.012307632219744846, + "learning_rate": 3.852e-06, + "loss": 0.0005, + "num_tokens": 48251676.0, + "reward": 3.8805007934570312, + "reward_std": 0.3217354416847229, + "rewards/reward_fn/mean": 3.8805007934570312, + "rewards/reward_fn/std": 0.3217354714870453, + "step": 2075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 179.46875, + "completions/mean_terminated_length": 179.46875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.23975054856218964, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.027581132511841133, + "learning_rate": 3.8499999999999996e-06, + "loss": 0.0011, + "num_tokens": 48270699.0, + "reward": 3.9692773818969727, + "reward_std": 0.17379257082939148, + "rewards/reward_fn/mean": 3.9692773818969727, + "rewards/reward_fn/std": 0.17379257082939148, + "step": 2076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1099.0, + "completions/max_terminated_length": 1099.0, + "completions/mean_length": 397.53125, + "completions/mean_terminated_length": 397.53125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.2398660353389537, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.011104219433036633, + "learning_rate": 3.847999999999999e-06, + "loss": 0.0004, + "num_tokens": 48301340.0, + "reward": 3.002673625946045, + "reward_std": 0.6486151814460754, + "rewards/reward_fn/mean": 3.002673625946045, + "rewards/reward_fn/std": 0.6486151218414307, + "step": 2077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 88.6875, + "completions/mean_terminated_length": 88.6875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.23998152211571774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.009649980114772916, + "learning_rate": 3.846e-06, + "loss": 0.0004, + "num_tokens": 48321394.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 72.0, + "completions/mean_terminated_length": 72.0, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.2400970088924818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.009862048907962162, + "learning_rate": 3.844e-06, + "loss": 0.0004, + "num_tokens": 48338450.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 138.46875, + "completions/mean_terminated_length": 138.46875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.24021249566924588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.00904744463332463, + "learning_rate": 3.842e-06, + "loss": 0.0004, + "num_tokens": 48365505.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 251.15625, + "completions/mean_terminated_length": 251.15625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.24032798244600992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.00949185684294207, + "learning_rate": 3.84e-06, + "loss": 0.0004, + "num_tokens": 48383238.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 143.125, + "completions/mean_terminated_length": 143.125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.240443469222774, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "kl": 0.007033891510218382, + "learning_rate": 3.8379999999999995e-06, + "loss": 0.0003, + "num_tokens": 48401450.0, + "reward": 3.9300107955932617, + "reward_std": 0.3959193527698517, + "rewards/reward_fn/mean": 3.9300107955932617, + "rewards/reward_fn/std": 0.3959193527698517, + "step": 2082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 114.96875, + "completions/mean_terminated_length": 114.96875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.24055895599953805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052001953125, + "kl": 0.006121396290836856, + "learning_rate": 3.835999999999999e-06, + "loss": 0.0002, + "num_tokens": 48421737.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 155.8125, + "completions/mean_terminated_length": 155.8125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.24067444277630212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.014821760283666663, + "learning_rate": 3.834e-06, + "loss": 0.0006, + "num_tokens": 48448579.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 111.5625, + "completions/mean_terminated_length": 111.5625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.24078992955306616, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.71875, + "kl": 0.007329414467676543, + "learning_rate": 3.832e-06, + "loss": 0.0003, + "num_tokens": 48473461.0, + "reward": 3.964320182800293, + "reward_std": 0.20183467864990234, + "rewards/reward_fn/mean": 3.964320182800293, + "rewards/reward_fn/std": 0.20183463394641876, + "step": 2085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 207.09375, + "completions/mean_terminated_length": 207.09375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.24090541632983023, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.0079879958066158, + "learning_rate": 3.83e-06, + "loss": 0.0003, + "num_tokens": 48498104.0, + "reward": 3.9305553436279297, + "reward_std": 0.22413145005702972, + "rewards/reward_fn/mean": 3.9305553436279297, + "rewards/reward_fn/std": 0.22413145005702972, + "step": 2086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 137.78125, + "completions/mean_terminated_length": 137.78125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.2410209031065943, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.007488032730179839, + "learning_rate": 3.828e-06, + "loss": 0.0003, + "num_tokens": 48524913.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 128.28125, + "completions/mean_terminated_length": 128.28125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.24113638988335837, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.012490909022744745, + "learning_rate": 3.826e-06, + "loss": 0.0005, + "num_tokens": 48541882.0, + "reward": 3.923588752746582, + "reward_std": 0.3009818494319916, + "rewards/reward_fn/mean": 3.923588752746582, + "rewards/reward_fn/std": 0.3009818196296692, + "step": 2088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 297.84375, + "completions/mean_terminated_length": 297.84375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.2412518766601224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04541015625, + "kl": 0.009138054243521765, + "learning_rate": 3.823999999999999e-06, + "loss": 0.0004, + "num_tokens": 48566005.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 80.40625, + "completions/mean_terminated_length": 80.40625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.24136736343688647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.0066692583641270176, + "learning_rate": 3.822e-06, + "loss": 0.0003, + "num_tokens": 48586946.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 277.53125, + "completions/mean_terminated_length": 277.53125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.24148285021365054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.99609375, + "kl": 0.00902494190086145, + "learning_rate": 3.82e-06, + "loss": 0.0004, + "num_tokens": 48610963.0, + "reward": 3.930070400238037, + "reward_std": 0.3955814838409424, + "rewards/reward_fn/mean": 3.930070400238037, + "rewards/reward_fn/std": 0.39558145403862, + "step": 2091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 192.0, + "completions/mean_terminated_length": 192.0, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.2415983369904146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.011463428076240234, + "learning_rate": 3.818e-06, + "loss": 0.0005, + "num_tokens": 48629459.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 65.46875, + "completions/mean_terminated_length": 65.46875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.24171382376717865, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.828125, + "kl": 0.0146784851240227, + "learning_rate": 3.8159999999999995e-06, + "loss": 0.0006, + "num_tokens": 48649314.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 2093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 440.5, + "completions/mean_terminated_length": 388.6451416015625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.24182931054394272, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.0103362692243536, + "learning_rate": 3.814e-06, + "loss": 0.0004, + "num_tokens": 48674578.0, + "reward": 2.8364784717559814, + "reward_std": 0.9508607983589172, + "rewards/reward_fn/mean": 2.8364784717559814, + "rewards/reward_fn/std": 0.9508607983589172, + "step": 2094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 151.4375, + "completions/mean_terminated_length": 151.4375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.24194479732070678, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.009171366516966373, + "learning_rate": 3.8119999999999997e-06, + "loss": 0.0004, + "num_tokens": 48701856.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 281.125, + "completions/mean_terminated_length": 281.125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.24206028409747085, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.016948432064964436, + "learning_rate": 3.81e-06, + "loss": 0.0007, + "num_tokens": 48728132.0, + "reward": 3.059288501739502, + "reward_std": 0.0693935975432396, + "rewards/reward_fn/mean": 3.059288501739502, + "rewards/reward_fn/std": 0.06939360499382019, + "step": 2096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 58.25, + "completions/mean_terminated_length": 58.25, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.2421757708742349, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2353515625, + "kl": 0.015652379806851968, + "learning_rate": 3.808e-06, + "loss": 0.0006, + "num_tokens": 48750700.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 222.46875, + "completions/mean_terminated_length": 222.46875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.24229125765099896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.017115997412474826, + "learning_rate": 3.806e-06, + "loss": 0.0007, + "num_tokens": 48769659.0, + "reward": 3.8024604320526123, + "reward_std": 0.3929992914199829, + "rewards/reward_fn/mean": 3.8024604320526123, + "rewards/reward_fn/std": 0.3929992914199829, + "step": 2098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 164.9375, + "completions/mean_terminated_length": 164.9375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.24240674442776303, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "kl": 0.009965345016098581, + "learning_rate": 3.8039999999999995e-06, + "loss": 0.0004, + "num_tokens": 48798297.0, + "reward": 3.9042725563049316, + "reward_std": 0.4115632176399231, + "rewards/reward_fn/mean": 3.9042725563049316, + "rewards/reward_fn/std": 0.4115631878376007, + "step": 2099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 76.28125, + "completions/mean_terminated_length": 76.28125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.2425222312045271, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.28125, + "kl": 0.007482433371478692, + "learning_rate": 3.8019999999999998e-06, + "loss": 0.0003, + "num_tokens": 48819778.0, + "reward": 3.972982406616211, + "reward_std": 0.15283527970314026, + "rewards/reward_fn/mean": 3.972982406616211, + "rewards/reward_fn/std": 0.15283530950546265, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 174.71875, + "completions/mean_terminated_length": 174.71875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.24263771798129113, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.008845246811688412, + "learning_rate": 3.7999999999999996e-06, + "loss": 0.0004, + "num_tokens": 48838169.0, + "reward": 3.9669013023376465, + "reward_std": 0.18723423779010773, + "rewards/reward_fn/mean": 3.9669013023376465, + "rewards/reward_fn/std": 0.18723425269126892, + "step": 2101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 372.53125, + "completions/mean_terminated_length": 372.53125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.2427532047580552, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.011698362286551856, + "learning_rate": 3.798e-06, + "loss": 0.0005, + "num_tokens": 48861610.0, + "reward": 3.4314699172973633, + "reward_std": 1.0006150007247925, + "rewards/reward_fn/mean": 3.4314699172973633, + "rewards/reward_fn/std": 1.0006150007247925, + "step": 2102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 94.5, + "completions/mean_terminated_length": 94.5, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.24286869153481927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.009870139219856355, + "learning_rate": 3.7959999999999997e-06, + "loss": 0.0004, + "num_tokens": 48874042.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 193.5625, + "completions/mean_terminated_length": 193.5625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.24298417831158334, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.013721337920287624, + "learning_rate": 3.794e-06, + "loss": 0.0005, + "num_tokens": 48893772.0, + "reward": 3.857837200164795, + "reward_std": 0.5594173073768616, + "rewards/reward_fn/mean": 3.857837200164795, + "rewards/reward_fn/std": 0.5594173073768616, + "step": 2104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 144.71875, + "completions/mean_terminated_length": 144.71875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.24309966508834738, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.013346998952329159, + "learning_rate": 3.7919999999999994e-06, + "loss": 0.0005, + "num_tokens": 48916835.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 226.875, + "completions/mean_terminated_length": 226.875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.24321515186511145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.012692025673459284, + "learning_rate": 3.7899999999999997e-06, + "loss": 0.0005, + "num_tokens": 48936383.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 115.15625, + "completions/mean_terminated_length": 115.15625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.2433306386418755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.005429142482171301, + "learning_rate": 3.7879999999999996e-06, + "loss": 0.0002, + "num_tokens": 48959716.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 245.3125, + "completions/mean_terminated_length": 245.3125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.24344612541863955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044921875, + "kl": 0.007856388525397051, + "learning_rate": 3.786e-06, + "loss": 0.0003, + "num_tokens": 48983598.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 304.65625, + "completions/mean_terminated_length": 304.65625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.24356161219540362, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.016574221619521268, + "learning_rate": 3.7839999999999997e-06, + "loss": 0.0007, + "num_tokens": 49017123.0, + "reward": 3.331512689590454, + "reward_std": 0.5282212495803833, + "rewards/reward_fn/mean": 3.331512689590454, + "rewards/reward_fn/std": 0.5282212495803833, + "step": 2109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 214.0625, + "completions/mean_terminated_length": 214.0625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.2436770989721677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.01138556539081037, + "learning_rate": 3.782e-06, + "loss": 0.0005, + "num_tokens": 49035909.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 168.375, + "completions/mean_terminated_length": 168.375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.24379258574893176, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.01318050708505325, + "learning_rate": 3.78e-06, + "loss": 0.0005, + "num_tokens": 49053265.0, + "reward": 3.7398529052734375, + "reward_std": 0.5401102304458618, + "rewards/reward_fn/mean": 3.7398529052734375, + "rewards/reward_fn/std": 0.5401102304458618, + "step": 2111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 252.71875, + "completions/mean_terminated_length": 252.71875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.2439080725256958, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.013014154130360112, + "learning_rate": 3.7779999999999997e-06, + "loss": 0.0005, + "num_tokens": 49070408.0, + "reward": 3.7926535606384277, + "reward_std": 0.6549919843673706, + "rewards/reward_fn/mean": 3.7926535606384277, + "rewards/reward_fn/std": 0.6549921035766602, + "step": 2112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 94.9375, + "completions/mean_terminated_length": 94.9375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.24402355930245986, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "kl": 0.03120318948640488, + "learning_rate": 3.7759999999999995e-06, + "loss": 0.0012, + "num_tokens": 49088294.0, + "reward": 3.8571817874908447, + "reward_std": 0.3375813663005829, + "rewards/reward_fn/mean": 3.8571817874908447, + "rewards/reward_fn/std": 0.3375813364982605, + "step": 2113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 204.59375, + "completions/mean_terminated_length": 204.59375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.24413904607922393, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.010625824048474897, + "learning_rate": 3.7739999999999998e-06, + "loss": 0.0004, + "num_tokens": 49117881.0, + "reward": 3.482614278793335, + "reward_std": 0.4957907497882843, + "rewards/reward_fn/mean": 3.482614278793335, + "rewards/reward_fn/std": 0.4957907199859619, + "step": 2114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 243.84375, + "completions/mean_terminated_length": 243.84375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.244254532855988, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.011604304105276242, + "learning_rate": 3.7719999999999996e-06, + "loss": 0.0005, + "num_tokens": 49146708.0, + "reward": 3.3920092582702637, + "reward_std": 0.35779353976249695, + "rewards/reward_fn/mean": 3.3920092582702637, + "rewards/reward_fn/std": 0.35779353976249695, + "step": 2115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 71.40625, + "completions/mean_terminated_length": 71.40625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.24437001963275204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.006529231206513941, + "learning_rate": 3.77e-06, + "loss": 0.0003, + "num_tokens": 49174369.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 96.875, + "completions/mean_terminated_length": 96.875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.2444855064095161, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.009211162367137149, + "learning_rate": 3.7679999999999998e-06, + "loss": 0.0004, + "num_tokens": 49189693.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1063.0, + "completions/max_terminated_length": 1063.0, + "completions/mean_length": 488.9375, + "completions/mean_terminated_length": 488.9375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.24460099318628017, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.010360269006923772, + "learning_rate": 3.766e-06, + "loss": 0.0004, + "num_tokens": 49222971.0, + "reward": 3.8599376678466797, + "reward_std": 0.5511667728424072, + "rewards/reward_fn/mean": 3.8599376678466797, + "rewards/reward_fn/std": 0.5511667728424072, + "step": 2118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 113.40625, + "completions/mean_terminated_length": 113.40625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.24471647996304424, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.010030887628090568, + "learning_rate": 3.7639999999999995e-06, + "loss": 0.0004, + "num_tokens": 49242440.0, + "reward": 3.97698974609375, + "reward_std": 0.130166158080101, + "rewards/reward_fn/mean": 3.97698974609375, + "rewards/reward_fn/std": 0.13016611337661743, + "step": 2119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 176.34375, + "completions/mean_terminated_length": 176.34375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.24483196673980828, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.010087779206514824, + "learning_rate": 3.7619999999999997e-06, + "loss": 0.0004, + "num_tokens": 49267251.0, + "reward": 3.9728801250457764, + "reward_std": 0.1534128338098526, + "rewards/reward_fn/mean": 3.9728801250457764, + "rewards/reward_fn/std": 0.1534128040075302, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 107.21875, + "completions/mean_terminated_length": 107.21875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.24494745351657235, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.013244896916148718, + "learning_rate": 3.7599999999999996e-06, + "loss": 0.0005, + "num_tokens": 49284058.0, + "reward": 3.9665699005126953, + "reward_std": 0.18910850584506989, + "rewards/reward_fn/mean": 3.9665699005126953, + "rewards/reward_fn/std": 0.1891084760427475, + "step": 2121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 241.71875, + "completions/mean_terminated_length": 241.71875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.24506294029333642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.022577384836040437, + "learning_rate": 3.758e-06, + "loss": 0.0009, + "num_tokens": 49315409.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 201.9375, + "completions/mean_terminated_length": 201.9375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.24517842707010049, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.011556057666894048, + "learning_rate": 3.7559999999999997e-06, + "loss": 0.0005, + "num_tokens": 49345071.0, + "reward": 3.923373222351074, + "reward_std": 0.24368223547935486, + "rewards/reward_fn/mean": 3.923373222351074, + "rewards/reward_fn/std": 0.24368219077587128, + "step": 2123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 88.03125, + "completions/mean_terminated_length": 88.03125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.24529391384686453, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.01089113908165018, + "learning_rate": 3.754e-06, + "loss": 0.0004, + "num_tokens": 49357328.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 223.9375, + "completions/mean_terminated_length": 223.9375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.2454094006236286, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.012930262106237933, + "learning_rate": 3.7519999999999994e-06, + "loss": 0.0005, + "num_tokens": 49383438.0, + "reward": 3.0548105239868164, + "reward_std": 0.05952825769782066, + "rewards/reward_fn/mean": 3.0548105239868164, + "rewards/reward_fn/std": 0.059528250247240067, + "step": 2125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 77.0625, + "completions/mean_terminated_length": 77.0625, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.24552488740039266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.0073769867813098244, + "learning_rate": 3.7499999999999997e-06, + "loss": 0.0003, + "num_tokens": 49410640.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 195.6875, + "completions/mean_terminated_length": 195.6875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.24564037417715673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.01023086033819709, + "learning_rate": 3.748e-06, + "loss": 0.0004, + "num_tokens": 49428934.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 187.375, + "completions/mean_terminated_length": 187.375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.24575586095392077, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.0172874522395432, + "learning_rate": 3.746e-06, + "loss": 0.0007, + "num_tokens": 49454290.0, + "reward": 3.416182279586792, + "reward_std": 0.6429321765899658, + "rewards/reward_fn/mean": 3.416182279586792, + "rewards/reward_fn/std": 0.6429321765899658, + "step": 2128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 254.5, + "completions/mean_terminated_length": 254.5, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.24587134773068484, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.03087627622880973, + "learning_rate": 3.744e-06, + "loss": 0.0012, + "num_tokens": 49482626.0, + "reward": 3.470036029815674, + "reward_std": 0.35796692967414856, + "rewards/reward_fn/mean": 3.470036029815674, + "rewards/reward_fn/std": 0.3579668700695038, + "step": 2129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 151.25, + "completions/mean_terminated_length": 151.25, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.2459868345074489, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.00818801106652245, + "learning_rate": 3.742e-06, + "loss": 0.0003, + "num_tokens": 49507786.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 54.5, + "completions/mean_terminated_length": 54.5, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.24610232128421297, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3515625, + "kl": 0.017408482686732896, + "learning_rate": 3.74e-06, + "loss": 0.0007, + "num_tokens": 49521274.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 117.0, + "completions/mean_terminated_length": 117.0, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.246217808060977, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8125, + "kl": 0.009915981776430272, + "learning_rate": 3.7379999999999996e-06, + "loss": 0.0004, + "num_tokens": 49541050.0, + "reward": 3.877242088317871, + "reward_std": 0.3339214622974396, + "rewards/reward_fn/mean": 3.877242088317871, + "rewards/reward_fn/std": 0.3339214324951172, + "step": 2132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 152.1875, + "completions/mean_terminated_length": 152.1875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.24633329483774108, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.009860787497018464, + "learning_rate": 3.736e-06, + "loss": 0.0004, + "num_tokens": 49564768.0, + "reward": 3.9757509231567383, + "reward_std": 0.13717417418956757, + "rewards/reward_fn/mean": 3.9757509231567383, + "rewards/reward_fn/std": 0.13717415928840637, + "step": 2133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 79.1875, + "completions/mean_terminated_length": 79.1875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.24644878161450515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.0074907307025569025, + "learning_rate": 3.7339999999999997e-06, + "loss": 0.0003, + "num_tokens": 49575462.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 301.0625, + "completions/mean_terminated_length": 301.0625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.2465642683912692, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.012227718441863544, + "learning_rate": 3.732e-06, + "loss": 0.0005, + "num_tokens": 49608936.0, + "reward": 3.759439706802368, + "reward_std": 0.4930351972579956, + "rewards/reward_fn/mean": 3.759439706802368, + "rewards/reward_fn/std": 0.4930351972579956, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 178.5, + "completions/mean_terminated_length": 178.5, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.24667975516803325, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.96875, + "kl": 0.01209472119808197, + "learning_rate": 3.73e-06, + "loss": 0.0005, + "num_tokens": 49622648.0, + "reward": 3.976452112197876, + "reward_std": 0.1332072764635086, + "rewards/reward_fn/mean": 3.976452112197876, + "rewards/reward_fn/std": 0.1332072615623474, + "step": 2136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 142.4375, + "completions/mean_terminated_length": 142.4375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.24679524194479732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053955078125, + "kl": 0.008287184231448919, + "learning_rate": 3.728e-06, + "loss": 0.0003, + "num_tokens": 49640774.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 260.3125, + "completions/mean_terminated_length": 260.3125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.2469107287215614, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051025390625, + "kl": 0.010420545062515885, + "learning_rate": 3.726e-06, + "loss": 0.0004, + "num_tokens": 49664176.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 143.84375, + "completions/mean_terminated_length": 143.84375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.24702621549832543, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.011886260173923802, + "learning_rate": 3.724e-06, + "loss": 0.0005, + "num_tokens": 49680555.0, + "reward": 3.8594048023223877, + "reward_std": 0.5532682538032532, + "rewards/reward_fn/mean": 3.8594048023223877, + "rewards/reward_fn/std": 0.5532682538032532, + "step": 2139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 175.09375, + "completions/mean_terminated_length": 175.09375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.2471417022750895, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.011129967722808942, + "learning_rate": 3.7219999999999997e-06, + "loss": 0.0004, + "num_tokens": 49707214.0, + "reward": 3.6559293270111084, + "reward_std": 0.735634446144104, + "rewards/reward_fn/mean": 3.6559293270111084, + "rewards/reward_fn/std": 0.735634446144104, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 228.25, + "completions/mean_terminated_length": 228.25, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.24725718905185357, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.009963623735529836, + "learning_rate": 3.72e-06, + "loss": 0.0004, + "num_tokens": 49736982.0, + "reward": 3.9847846031188965, + "reward_std": 0.08607178181409836, + "rewards/reward_fn/mean": 3.9847846031188965, + "rewards/reward_fn/std": 0.08607174456119537, + "step": 2141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 99.9375, + "completions/mean_terminated_length": 99.9375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.24737267582861763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.01332107448251918, + "learning_rate": 3.718e-06, + "loss": 0.0005, + "num_tokens": 49757972.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 386.15625, + "completions/mean_terminated_length": 386.15625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.24748816260538167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0419921875, + "kl": 0.008977302604762372, + "learning_rate": 3.716e-06, + "loss": 0.0004, + "num_tokens": 49780025.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 329.71875, + "completions/mean_terminated_length": 329.71875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.24760364938214574, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.01096777465136256, + "learning_rate": 3.714e-06, + "loss": 0.0004, + "num_tokens": 49800176.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 143.53125, + "completions/mean_terminated_length": 143.53125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.2477191361589098, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.0077344626624835655, + "learning_rate": 3.712e-06, + "loss": 0.0003, + "num_tokens": 49822689.0, + "reward": 3.9702539443969727, + "reward_std": 0.1682698279619217, + "rewards/reward_fn/mean": 3.9702539443969727, + "rewards/reward_fn/std": 0.16826985776424408, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 142.09375, + "completions/mean_terminated_length": 142.09375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.24783462293567388, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.010272190425894223, + "learning_rate": 3.7099999999999996e-06, + "loss": 0.0004, + "num_tokens": 49851460.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 222.34375, + "completions/mean_terminated_length": 222.34375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.24795010971243792, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057861328125, + "kl": 0.010766115221485961, + "learning_rate": 3.708e-06, + "loss": 0.0004, + "num_tokens": 49874127.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 263.71875, + "completions/mean_terminated_length": 263.71875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.24806559648920198, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.012008916644845158, + "learning_rate": 3.7059999999999998e-06, + "loss": 0.0005, + "num_tokens": 49902342.0, + "reward": 3.9289932250976562, + "reward_std": 0.40167468786239624, + "rewards/reward_fn/mean": 3.9289932250976562, + "rewards/reward_fn/std": 0.401674747467041, + "step": 2148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 307.9375, + "completions/mean_terminated_length": 307.9375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.24818108326596605, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.014812837383942679, + "learning_rate": 3.704e-06, + "loss": 0.0006, + "num_tokens": 49923364.0, + "reward": 3.3547654151916504, + "reward_std": 1.0480890274047852, + "rewards/reward_fn/mean": 3.3547654151916504, + "rewards/reward_fn/std": 1.0480889081954956, + "step": 2149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 103.90625, + "completions/mean_terminated_length": 103.90625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.24829657004273012, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.01074177789996611, + "learning_rate": 3.702e-06, + "loss": 0.0004, + "num_tokens": 49942593.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.24841205681949416, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.009570764261297882, + "learning_rate": 3.7e-06, + "loss": 0.0004, + "num_tokens": 49979208.0, + "reward": 3.9635114669799805, + "reward_std": 0.20641013979911804, + "rewards/reward_fn/mean": 3.9635114669799805, + "rewards/reward_fn/std": 0.20641009509563446, + "step": 2151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1180.0, + "completions/max_terminated_length": 1180.0, + "completions/mean_length": 431.25, + "completions/mean_terminated_length": 431.25, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.24852754359625823, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.012045554904034361, + "learning_rate": 3.6979999999999996e-06, + "loss": 0.0005, + "num_tokens": 50001360.0, + "reward": 2.5795063972473145, + "reward_std": 0.7364576458930969, + "rewards/reward_fn/mean": 2.5795063972473145, + "rewards/reward_fn/std": 0.7364576458930969, + "step": 2152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 81.0625, + "completions/mean_terminated_length": 81.0625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.2486430303730223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.004908867769700009, + "learning_rate": 3.696e-06, + "loss": 0.0002, + "num_tokens": 50016562.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 164.53125, + "completions/mean_terminated_length": 164.53125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.24875851714978636, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.005826296815939713, + "learning_rate": 3.6939999999999997e-06, + "loss": 0.0002, + "num_tokens": 50043811.0, + "reward": 3.9744513034820557, + "reward_std": 0.14452558755874634, + "rewards/reward_fn/mean": 3.9744513034820557, + "rewards/reward_fn/std": 0.14452557265758514, + "step": 2154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 231.84375, + "completions/mean_terminated_length": 231.84375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.2488740039265504, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.00784140353789553, + "learning_rate": 3.692e-06, + "loss": 0.0003, + "num_tokens": 50074526.0, + "reward": 3.9236321449279785, + "reward_std": 0.30056554079055786, + "rewards/reward_fn/mean": 3.9236321449279785, + "rewards/reward_fn/std": 0.30056557059288025, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 111.75, + "completions/mean_terminated_length": 111.75, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.24898949070331447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040771484375, + "kl": 0.0040281211367982905, + "learning_rate": 3.69e-06, + "loss": 0.0002, + "num_tokens": 50106934.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 328.21875, + "completions/mean_terminated_length": 328.21875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.24910497748007854, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.71875, + "kl": 0.012351596844382584, + "learning_rate": 3.688e-06, + "loss": 0.0005, + "num_tokens": 50130333.0, + "reward": 3.85811185836792, + "reward_std": 0.5583300590515137, + "rewards/reward_fn/mean": 3.85811185836792, + "rewards/reward_fn/std": 0.5583301186561584, + "step": 2157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 205.15625, + "completions/mean_terminated_length": 205.15625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.2492204642568426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.016270814536255784, + "learning_rate": 3.686e-06, + "loss": 0.0007, + "num_tokens": 50149698.0, + "reward": 3.8113620281219482, + "reward_std": 0.4001259207725525, + "rewards/reward_fn/mean": 3.8113620281219482, + "rewards/reward_fn/std": 0.4001259207725525, + "step": 2158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 112.125, + "completions/mean_terminated_length": 112.125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.24933595103360665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.010576930741081014, + "learning_rate": 3.684e-06, + "loss": 0.0004, + "num_tokens": 50166086.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 143.25, + "completions/mean_terminated_length": 143.25, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.2494514378103707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.00881895035126945, + "learning_rate": 3.6819999999999996e-06, + "loss": 0.0004, + "num_tokens": 50188206.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 154.15625, + "completions/mean_terminated_length": 154.15625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.24956692458713478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.0087562359549338, + "learning_rate": 3.68e-06, + "loss": 0.0004, + "num_tokens": 50219475.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 121.75, + "completions/mean_terminated_length": 121.75, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.24968241136389882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.010406331886770204, + "learning_rate": 3.6779999999999998e-06, + "loss": 0.0004, + "num_tokens": 50239147.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 342.0625, + "completions/mean_terminated_length": 342.0625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.2497978981406629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03466796875, + "kl": 0.00918240871396847, + "learning_rate": 3.676e-06, + "loss": 0.0004, + "num_tokens": 50265741.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 133.9375, + "completions/mean_terminated_length": 133.9375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.24991338491742696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.012750824564136565, + "learning_rate": 3.674e-06, + "loss": 0.0005, + "num_tokens": 50291179.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 346.0, + "completions/mean_terminated_length": 346.0, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.250028871694191, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045654296875, + "kl": 0.009759487889823504, + "learning_rate": 3.672e-06, + "loss": 0.0004, + "num_tokens": 50317483.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 252.4375, + "completions/mean_terminated_length": 252.4375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.25014435847095506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.00985797092289431, + "learning_rate": 3.6699999999999996e-06, + "loss": 0.0004, + "num_tokens": 50344473.0, + "reward": 3.0636677742004395, + "reward_std": 0.24728263914585114, + "rewards/reward_fn/mean": 3.0636677742004395, + "rewards/reward_fn/std": 0.24728263914585114, + "step": 2166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 296.0625, + "completions/mean_terminated_length": 296.0625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.25025984524771916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.010876013286178932, + "learning_rate": 3.668e-06, + "loss": 0.0004, + "num_tokens": 50380123.0, + "reward": 2.9285826683044434, + "reward_std": 0.11617077142000198, + "rewards/reward_fn/mean": 2.9285826683044434, + "rewards/reward_fn/std": 0.11617080122232437, + "step": 2167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 401.40625, + "completions/mean_terminated_length": 401.40625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.2503753320244832, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.008303324517328292, + "learning_rate": 3.6659999999999997e-06, + "loss": 0.0003, + "num_tokens": 50402824.0, + "reward": 3.8571324348449707, + "reward_std": 0.5621849298477173, + "rewards/reward_fn/mean": 3.8571324348449707, + "rewards/reward_fn/std": 0.5621849894523621, + "step": 2168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 337.59375, + "completions/mean_terminated_length": 337.59375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.25049081880124724, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.013427655401756056, + "learning_rate": 3.664e-06, + "loss": 0.0005, + "num_tokens": 50431835.0, + "reward": 3.5988738536834717, + "reward_std": 0.7323111295700073, + "rewards/reward_fn/mean": 3.5988738536834717, + "rewards/reward_fn/std": 0.7323111295700073, + "step": 2169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 112.5625, + "completions/mean_terminated_length": 112.5625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.25060630557801133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.007245955792313907, + "learning_rate": 3.662e-06, + "loss": 0.0003, + "num_tokens": 50447309.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 210.5625, + "completions/mean_terminated_length": 210.5625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.2507217923547754, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.007344278979871888, + "learning_rate": 3.66e-06, + "loss": 0.0003, + "num_tokens": 50472703.0, + "reward": 3.974837064743042, + "reward_std": 0.14234299957752228, + "rewards/reward_fn/mean": 3.974837064743042, + "rewards/reward_fn/std": 0.1423429697751999, + "step": 2171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 124.375, + "completions/mean_terminated_length": 124.375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.2508372791315394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.009674550074123545, + "learning_rate": 3.658e-06, + "loss": 0.0004, + "num_tokens": 50489387.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 90.21875, + "completions/mean_terminated_length": 90.21875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.2509527659083035, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1875, + "kl": 0.015564827539492399, + "learning_rate": 3.656e-06, + "loss": 0.0006, + "num_tokens": 50510482.0, + "reward": 3.480426788330078, + "reward_std": 0.017540594562888145, + "rewards/reward_fn/mean": 3.480426788330078, + "rewards/reward_fn/std": 0.01754056289792061, + "step": 2173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1347.0, + "completions/max_terminated_length": 1347.0, + "completions/mean_length": 310.46875, + "completions/mean_terminated_length": 310.46875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.25106825268506755, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.013885303211282007, + "learning_rate": 3.6539999999999997e-06, + "loss": 0.0006, + "num_tokens": 50532705.0, + "reward": 3.7901735305786133, + "reward_std": 0.6628355383872986, + "rewards/reward_fn/mean": 3.7901735305786133, + "rewards/reward_fn/std": 0.6628354787826538, + "step": 2174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 260.09375, + "completions/mean_terminated_length": 260.09375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.25118373946183165, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.017679850570857525, + "learning_rate": 3.652e-06, + "loss": 0.0007, + "num_tokens": 50564676.0, + "reward": 3.5434861183166504, + "reward_std": 0.6904910206794739, + "rewards/reward_fn/mean": 3.5434861183166504, + "rewards/reward_fn/std": 0.6904910802841187, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 138.40625, + "completions/mean_terminated_length": 138.40625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.2512992262385957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.009829045331571251, + "learning_rate": 3.6499999999999998e-06, + "loss": 0.0004, + "num_tokens": 50582641.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 120.5625, + "completions/mean_terminated_length": 120.5625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.2514147130153597, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.010087804344948381, + "learning_rate": 3.648e-06, + "loss": 0.0004, + "num_tokens": 50602435.0, + "reward": 3.937424659729004, + "reward_std": 0.2008819580078125, + "rewards/reward_fn/mean": 3.937424659729004, + "rewards/reward_fn/std": 0.2008819729089737, + "step": 2177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 353.0, + "completions/mean_terminated_length": 353.0, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.2515301997921238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.009497121354797855, + "learning_rate": 3.646e-06, + "loss": 0.0004, + "num_tokens": 50624579.0, + "reward": 3.9292407035827637, + "reward_std": 0.40027540922164917, + "rewards/reward_fn/mean": 3.9292407035827637, + "rewards/reward_fn/std": 0.40027540922164917, + "step": 2178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 132.96875, + "completions/mean_terminated_length": 132.96875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.25164568656888786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.011370569482096471, + "learning_rate": 3.6439999999999998e-06, + "loss": 0.0005, + "num_tokens": 50650722.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 204.375, + "completions/mean_terminated_length": 204.375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.2517611733456519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.01292159476724919, + "learning_rate": 3.6419999999999996e-06, + "loss": 0.0005, + "num_tokens": 50669806.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 145.28125, + "completions/mean_terminated_length": 145.28125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.251876660122416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.006015627001033863, + "learning_rate": 3.64e-06, + "loss": 0.0002, + "num_tokens": 50702455.0, + "reward": 3.92844820022583, + "reward_std": 0.40475744009017944, + "rewards/reward_fn/mean": 3.92844820022583, + "rewards/reward_fn/std": 0.4047574996948242, + "step": 2181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 77.59375, + "completions/mean_terminated_length": 77.59375, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.25199214689918004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04541015625, + "kl": 0.004719783461041516, + "learning_rate": 3.6379999999999997e-06, + "loss": 0.0002, + "num_tokens": 50716714.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 153.40625, + "completions/mean_terminated_length": 153.40625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.25210763367594413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.012227102619362995, + "learning_rate": 3.636e-06, + "loss": 0.0005, + "num_tokens": 50744247.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 65.8125, + "completions/mean_terminated_length": 65.8125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.25222312045270817, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.008050021744566038, + "learning_rate": 3.634e-06, + "loss": 0.0003, + "num_tokens": 50755889.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 86.0, + "completions/mean_terminated_length": 86.0, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.2523386072294722, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.015073406626470387, + "learning_rate": 3.632e-06, + "loss": 0.0006, + "num_tokens": 50777137.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 166.65625, + "completions/mean_terminated_length": 166.65625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.2524540940062363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05517578125, + "kl": 0.009434665596927516, + "learning_rate": 3.6299999999999995e-06, + "loss": 0.0004, + "num_tokens": 50799142.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 254.90625, + "completions/mean_terminated_length": 254.90625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.25256958078300035, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.009545746215735562, + "learning_rate": 3.628e-06, + "loss": 0.0004, + "num_tokens": 50828867.0, + "reward": 3.8640174865722656, + "reward_std": 0.36683356761932373, + "rewards/reward_fn/mean": 3.8640174865722656, + "rewards/reward_fn/std": 0.36683356761932373, + "step": 2187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 109.9375, + "completions/mean_terminated_length": 109.9375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.2526850675597644, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.017984884718316607, + "learning_rate": 3.6259999999999997e-06, + "loss": 0.0007, + "num_tokens": 50853217.0, + "reward": 3.483107089996338, + "reward_std": 0.1408856213092804, + "rewards/reward_fn/mean": 3.483107089996338, + "rewards/reward_fn/std": 0.1408856213092804, + "step": 2188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 263.03125, + "completions/mean_terminated_length": 263.03125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.2528005543365285, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.015255685342708603, + "learning_rate": 3.624e-06, + "loss": 0.0006, + "num_tokens": 50880194.0, + "reward": 3.645634174346924, + "reward_std": 0.4331796169281006, + "rewards/reward_fn/mean": 3.645634174346924, + "rewards/reward_fn/std": 0.4331795871257782, + "step": 2189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 118.75, + "completions/mean_terminated_length": 118.75, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.2529160411132925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.012957731596543454, + "learning_rate": 3.622e-06, + "loss": 0.0005, + "num_tokens": 50897018.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 196.25, + "completions/mean_terminated_length": 196.25, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.25303152789005656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.011928291292861104, + "learning_rate": 3.62e-06, + "loss": 0.0005, + "num_tokens": 50915106.0, + "reward": 3.961554527282715, + "reward_std": 0.2174801379442215, + "rewards/reward_fn/mean": 3.961554527282715, + "rewards/reward_fn/std": 0.2174801379442215, + "step": 2191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 199.25, + "completions/mean_terminated_length": 199.25, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.25314701466682066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041748046875, + "kl": 0.00793367919686716, + "learning_rate": 3.618e-06, + "loss": 0.0003, + "num_tokens": 50932490.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 292.15625, + "completions/mean_terminated_length": 292.15625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.2532625014435847, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.013909784422139637, + "learning_rate": 3.6159999999999998e-06, + "loss": 0.0006, + "num_tokens": 50953487.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 2193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 95.21875, + "completions/mean_terminated_length": 95.21875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.2533779882203488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.007281954160134774, + "learning_rate": 3.6139999999999996e-06, + "loss": 0.0003, + "num_tokens": 50968694.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 111.25, + "completions/mean_terminated_length": 111.25, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.25349347499711283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.006871525896713138, + "learning_rate": 3.612e-06, + "loss": 0.0003, + "num_tokens": 50994846.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 82.65625, + "completions/mean_terminated_length": 82.65625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.2536089617738769, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.008404011445236392, + "learning_rate": 3.6099999999999997e-06, + "loss": 0.0003, + "num_tokens": 51013651.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 95.78125, + "completions/mean_terminated_length": 95.78125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.25372444855064097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.005398306027927902, + "learning_rate": 3.608e-06, + "loss": 0.0002, + "num_tokens": 51039148.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 87.0625, + "completions/mean_terminated_length": 87.0625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.253839935327405, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.006070589637602097, + "learning_rate": 3.606e-06, + "loss": 0.0002, + "num_tokens": 51064110.0, + "reward": 3.926403760910034, + "reward_std": 0.41632360219955444, + "rewards/reward_fn/mean": 3.926403760910034, + "rewards/reward_fn/std": 0.41632354259490967, + "step": 2198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 207.4375, + "completions/mean_terminated_length": 207.4375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.25395542210416905, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.004941421710100258, + "learning_rate": 3.604e-06, + "loss": 0.0002, + "num_tokens": 51084284.0, + "reward": 3.104555368423462, + "reward_std": 0.3474453389644623, + "rewards/reward_fn/mean": 3.104555368423462, + "rewards/reward_fn/std": 0.3474453091621399, + "step": 2199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 224.8125, + "completions/mean_terminated_length": 224.8125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.25407090888093314, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.013474018720444292, + "learning_rate": 3.6019999999999996e-06, + "loss": 0.0005, + "num_tokens": 51110902.0, + "reward": 3.8249471187591553, + "reward_std": 0.34267786145210266, + "rewards/reward_fn/mean": 3.8249471187591553, + "rewards/reward_fn/std": 0.34267786145210266, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 286.59375, + "completions/mean_terminated_length": 286.59375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.2541863956576972, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.013438812689855695, + "learning_rate": 3.6e-06, + "loss": 0.0005, + "num_tokens": 51133705.0, + "reward": 3.9315221309661865, + "reward_std": 0.38736966252326965, + "rewards/reward_fn/mean": 3.9315221309661865, + "rewards/reward_fn/std": 0.38736969232559204, + "step": 2201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 202.5, + "completions/mean_terminated_length": 202.5, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.2543018824344613, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.015505524614127353, + "learning_rate": 3.5979999999999997e-06, + "loss": 0.0006, + "num_tokens": 51162041.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 85.0625, + "completions/mean_terminated_length": 85.0625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.2544173692112253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.004857354406340164, + "learning_rate": 3.596e-06, + "loss": 0.0002, + "num_tokens": 51186267.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 297.5625, + "completions/mean_terminated_length": 297.5625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.25453285598798936, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.01341590350784827, + "learning_rate": 3.594e-06, + "loss": 0.0005, + "num_tokens": 51204173.0, + "reward": 3.2210028171539307, + "reward_std": 0.5050860047340393, + "rewards/reward_fn/mean": 3.2210028171539307, + "rewards/reward_fn/std": 0.5050860047340393, + "step": 2204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 224.90625, + "completions/mean_terminated_length": 224.90625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.25464834276475345, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.013352198788197711, + "learning_rate": 3.592e-06, + "loss": 0.0005, + "num_tokens": 51234634.0, + "reward": 3.0143399238586426, + "reward_std": 0.1032041683793068, + "rewards/reward_fn/mean": 3.0143399238586426, + "rewards/reward_fn/std": 0.103204146027565, + "step": 2205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 119.96875, + "completions/mean_terminated_length": 119.96875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.2547638295415175, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.01712460062117316, + "learning_rate": 3.5899999999999995e-06, + "loss": 0.0007, + "num_tokens": 51251177.0, + "reward": 3.9721555709838867, + "reward_std": 0.15751120448112488, + "rewards/reward_fn/mean": 3.9721555709838867, + "rewards/reward_fn/std": 0.15751120448112488, + "step": 2206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 270.59375, + "completions/mean_terminated_length": 270.59375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.25487931631828153, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.012239973148098215, + "learning_rate": 3.5879999999999998e-06, + "loss": 0.0005, + "num_tokens": 51281020.0, + "reward": 2.9156036376953125, + "reward_std": 0.4379645586013794, + "rewards/reward_fn/mean": 2.9156036376953125, + "rewards/reward_fn/std": 0.4379644989967346, + "step": 2207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 137.25, + "completions/mean_terminated_length": 137.25, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.25499480309504563, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.012349739467026666, + "learning_rate": 3.5859999999999996e-06, + "loss": 0.0005, + "num_tokens": 51309636.0, + "reward": 3.9402360916137695, + "reward_std": 0.23517529666423798, + "rewards/reward_fn/mean": 3.9402360916137695, + "rewards/reward_fn/std": 0.23517531156539917, + "step": 2208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 234.9375, + "completions/mean_terminated_length": 234.9375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.25511028987180967, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.010981939834891818, + "learning_rate": 3.584e-06, + "loss": 0.0004, + "num_tokens": 51332290.0, + "reward": 3.928896903991699, + "reward_std": 0.4022190570831299, + "rewards/reward_fn/mean": 3.928896903991699, + "rewards/reward_fn/std": 0.4022189974784851, + "step": 2209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 204.625, + "completions/mean_terminated_length": 204.625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.25522577664857377, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.015130218453123234, + "learning_rate": 3.5819999999999998e-06, + "loss": 0.0006, + "num_tokens": 51365814.0, + "reward": 3.9346189498901367, + "reward_std": 0.2611849009990692, + "rewards/reward_fn/mean": 3.9346189498901367, + "rewards/reward_fn/std": 0.2611849308013916, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 54.5625, + "completions/mean_terminated_length": 54.5625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.2553412634253378, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.65625, + "kl": 0.016933049013459822, + "learning_rate": 3.58e-06, + "loss": 0.0007, + "num_tokens": 51380680.0, + "reward": 3.75, + "reward_std": 0.9837387204170227, + "rewards/reward_fn/mean": 3.75, + "rewards/reward_fn/std": 0.9837387204170227, + "step": 2211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 97.21875, + "completions/mean_terminated_length": 97.21875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.25545675020210185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.003464322142463061, + "learning_rate": 3.578e-06, + "loss": 0.0001, + "num_tokens": 51399855.0, + "reward": 3.959615707397461, + "reward_std": 0.1657210886478424, + "rewards/reward_fn/mean": 3.959615707397461, + "rewards/reward_fn/std": 0.16572105884552002, + "step": 2212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 74.78125, + "completions/mean_terminated_length": 74.78125, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.25557223697886594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.007973129038873594, + "learning_rate": 3.5759999999999997e-06, + "loss": 0.0003, + "num_tokens": 51413672.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 161.21875, + "completions/mean_terminated_length": 161.21875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.25568772375563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.00778700861701509, + "learning_rate": 3.5739999999999996e-06, + "loss": 0.0003, + "num_tokens": 51441807.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 191.09375, + "completions/mean_terminated_length": 191.09375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.255803210532394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.007960332401125925, + "learning_rate": 3.572e-06, + "loss": 0.0003, + "num_tokens": 51471698.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 79.6875, + "completions/mean_terminated_length": 79.6875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.2559186973091581, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.0064514343393966556, + "learning_rate": 3.5699999999999997e-06, + "loss": 0.0003, + "num_tokens": 51498344.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 202.125, + "completions/mean_terminated_length": 202.125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.25603418408592216, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.013706686571822502, + "learning_rate": 3.568e-06, + "loss": 0.0005, + "num_tokens": 51522956.0, + "reward": 3.25799560546875, + "reward_std": 1.0636632442474365, + "rewards/reward_fn/mean": 3.25799560546875, + "rewards/reward_fn/std": 1.0636632442474365, + "step": 2217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 228.15625, + "completions/mean_terminated_length": 228.15625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.2561496708626862, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.009029381908476353, + "learning_rate": 3.566e-06, + "loss": 0.0004, + "num_tokens": 51538353.0, + "reward": 3.930619716644287, + "reward_std": 0.39247429370880127, + "rewards/reward_fn/mean": 3.930619716644287, + "rewards/reward_fn/std": 0.39247429370880127, + "step": 2218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 132.5, + "completions/mean_terminated_length": 132.5, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.2562651576394503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.010977185011142865, + "learning_rate": 3.564e-06, + "loss": 0.0004, + "num_tokens": 51554433.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 187.84375, + "completions/mean_terminated_length": 187.84375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.25638064441621433, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.008923199624405243, + "learning_rate": 3.5619999999999995e-06, + "loss": 0.0004, + "num_tokens": 51573052.0, + "reward": 3.9770216941833496, + "reward_std": 0.12998411059379578, + "rewards/reward_fn/mean": 3.9770216941833496, + "rewards/reward_fn/std": 0.1299840807914734, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 231.90625, + "completions/mean_terminated_length": 231.90625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.2564961311929784, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.0119397061644122, + "learning_rate": 3.56e-06, + "loss": 0.0005, + "num_tokens": 51606169.0, + "reward": 3.9611966609954834, + "reward_std": 0.1542895883321762, + "rewards/reward_fn/mean": 3.9611966609954834, + "rewards/reward_fn/std": 0.15428955852985382, + "step": 2221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 223.5, + "completions/mean_terminated_length": 223.5, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.25661161796974247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.01235925534274429, + "learning_rate": 3.5579999999999996e-06, + "loss": 0.0005, + "num_tokens": 51625929.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 103.6875, + "completions/mean_terminated_length": 103.6875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.2567271047465065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.011223699173569912, + "learning_rate": 3.556e-06, + "loss": 0.0004, + "num_tokens": 51641375.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 151.875, + "completions/mean_terminated_length": 151.875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.2568425915232706, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.011600740122958086, + "learning_rate": 3.5539999999999998e-06, + "loss": 0.0005, + "num_tokens": 51661243.0, + "reward": 3.9319701194763184, + "reward_std": 0.38483551144599915, + "rewards/reward_fn/mean": 3.9319701194763184, + "rewards/reward_fn/std": 0.3848355710506439, + "step": 2224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 76.53125, + "completions/mean_terminated_length": 76.53125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.25695807830003464, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.984375, + "kl": 0.010688884125556797, + "learning_rate": 3.552e-06, + "loss": 0.0004, + "num_tokens": 51685324.0, + "reward": 3.9865059852600098, + "reward_std": 0.07633397728204727, + "rewards/reward_fn/mean": 3.9865059852600098, + "rewards/reward_fn/std": 0.07633396983146667, + "step": 2225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 291.125, + "completions/mean_terminated_length": 291.125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.2570735650767987, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04296875, + "kl": 0.009117140740272589, + "learning_rate": 3.5499999999999995e-06, + "loss": 0.0004, + "num_tokens": 51706672.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 336.375, + "completions/mean_terminated_length": 336.375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.2571890518535628, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.010666519490769133, + "learning_rate": 3.5479999999999997e-06, + "loss": 0.0004, + "num_tokens": 51733660.0, + "reward": 3.7224557399749756, + "reward_std": 0.7466903328895569, + "rewards/reward_fn/mean": 3.7224557399749756, + "rewards/reward_fn/std": 0.7466902732849121, + "step": 2227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 102.0625, + "completions/mean_terminated_length": 102.0625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.2573045386303268, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "kl": 0.015340666723204777, + "learning_rate": 3.5459999999999996e-06, + "loss": 0.0006, + "num_tokens": 51755838.0, + "reward": 3.574388265609741, + "reward_std": 0.06643790751695633, + "rewards/reward_fn/mean": 3.574388265609741, + "rewards/reward_fn/std": 0.06643790751695633, + "step": 2228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 450.46875, + "completions/mean_terminated_length": 450.46875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.2574200254070909, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.010526090525672771, + "learning_rate": 3.544e-06, + "loss": 0.0004, + "num_tokens": 51799533.0, + "reward": 3.5891644954681396, + "reward_std": 0.718937873840332, + "rewards/reward_fn/mean": 3.5891644954681396, + "rewards/reward_fn/std": 0.718937873840332, + "step": 2229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 182.0, + "completions/mean_terminated_length": 182.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.25753551218385495, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.019938460565754212, + "learning_rate": 3.5419999999999997e-06, + "loss": 0.0008, + "num_tokens": 51827469.0, + "reward": 3.864237070083618, + "reward_std": 0.3238731026649475, + "rewards/reward_fn/mean": 3.864237070083618, + "rewards/reward_fn/std": 0.3238731026649475, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 387.71875, + "completions/mean_terminated_length": 387.71875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.257650998960619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.008730275338166393, + "learning_rate": 3.54e-06, + "loss": 0.0003, + "num_tokens": 51861476.0, + "reward": 3.860870361328125, + "reward_std": 0.3289678394794464, + "rewards/reward_fn/mean": 3.860870361328125, + "rewards/reward_fn/std": 0.328967809677124, + "step": 2231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 502.71875, + "completions/mean_terminated_length": 502.71875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.2577664857373831, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9296875, + "kl": 0.008219362091040239, + "learning_rate": 3.538e-06, + "loss": 0.0003, + "num_tokens": 51887163.0, + "reward": 3.9320781230926514, + "reward_std": 0.3842242360115051, + "rewards/reward_fn/mean": 3.9320781230926514, + "rewards/reward_fn/std": 0.3842242658138275, + "step": 2232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 266.65625, + "completions/mean_terminated_length": 266.65625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.25788197251414713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.008617438506917097, + "learning_rate": 3.5359999999999997e-06, + "loss": 0.0003, + "num_tokens": 51911728.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 168.90625, + "completions/mean_terminated_length": 168.90625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.25799745929091117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.007159134111134335, + "learning_rate": 3.5339999999999995e-06, + "loss": 0.0003, + "num_tokens": 51929933.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 228.46875, + "completions/mean_terminated_length": 228.46875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.25811294606767526, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.010695525241317227, + "learning_rate": 3.532e-06, + "loss": 0.0004, + "num_tokens": 51949564.0, + "reward": 3.9288508892059326, + "reward_std": 0.4024798274040222, + "rewards/reward_fn/mean": 3.9288508892059326, + "rewards/reward_fn/std": 0.4024798274040222, + "step": 2235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 251.09375, + "completions/mean_terminated_length": 251.09375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.2582284328444393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038818359375, + "kl": 0.007816721423296258, + "learning_rate": 3.5299999999999997e-06, + "loss": 0.0003, + "num_tokens": 51972639.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 92.6875, + "completions/mean_terminated_length": 92.6875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.2583439196212034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.004357081917078176, + "learning_rate": 3.528e-06, + "loss": 0.0002, + "num_tokens": 51995029.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 105.46875, + "completions/mean_terminated_length": 105.46875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.25845940639796744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.007492932287277654, + "learning_rate": 3.5259999999999998e-06, + "loss": 0.0003, + "num_tokens": 52018116.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 123.46875, + "completions/mean_terminated_length": 123.46875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.2585748931747315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.007493940443964675, + "learning_rate": 3.524e-06, + "loss": 0.0003, + "num_tokens": 52036435.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 107.25, + "completions/mean_terminated_length": 107.25, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.2586903799514956, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.01177442993503064, + "learning_rate": 3.5219999999999995e-06, + "loss": 0.0005, + "num_tokens": 52056923.0, + "reward": 3.1260666847229004, + "reward_std": 0.12646375596523285, + "rewards/reward_fn/mean": 3.1260666847229004, + "rewards/reward_fn/std": 0.12646372616291046, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 121.34375, + "completions/mean_terminated_length": 121.34375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.2588058667282596, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.017318805868853815, + "learning_rate": 3.5199999999999998e-06, + "loss": 0.0007, + "num_tokens": 52088262.0, + "reward": 3.968508720397949, + "reward_std": 0.17814156413078308, + "rewards/reward_fn/mean": 3.968508720397949, + "rewards/reward_fn/std": 0.1781415343284607, + "step": 2241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 114.1875, + "completions/mean_terminated_length": 114.1875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.25892135350502365, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.004875977638221229, + "learning_rate": 3.5179999999999996e-06, + "loss": 0.0002, + "num_tokens": 52115020.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 226.46875, + "completions/mean_terminated_length": 226.46875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.25903684028178775, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.010596410982543603, + "learning_rate": 3.516e-06, + "loss": 0.0004, + "num_tokens": 52146107.0, + "reward": 3.371830463409424, + "reward_std": 0.31774941086769104, + "rewards/reward_fn/mean": 3.371830463409424, + "rewards/reward_fn/std": 0.31774941086769104, + "step": 2243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 229.9375, + "completions/mean_terminated_length": 229.9375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.2591523270585518, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.012200534183648415, + "learning_rate": 3.5139999999999997e-06, + "loss": 0.0005, + "num_tokens": 52179097.0, + "reward": 3.8276994228363037, + "reward_std": 0.4626365900039673, + "rewards/reward_fn/mean": 3.8276994228363037, + "rewards/reward_fn/std": 0.4626365602016449, + "step": 2244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 69.15625, + "completions/mean_terminated_length": 69.15625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.25926781383531583, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.006574294222446042, + "learning_rate": 3.512e-06, + "loss": 0.0003, + "num_tokens": 52192670.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 68.84375, + "completions/mean_terminated_length": 68.84375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.2593833006120799, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2060546875, + "kl": 0.014595413726055995, + "learning_rate": 3.5099999999999994e-06, + "loss": 0.0006, + "num_tokens": 52204505.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 141.0625, + "completions/mean_terminated_length": 141.0625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.25949878738884397, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.010237839400360826, + "learning_rate": 3.5079999999999997e-06, + "loss": 0.0004, + "num_tokens": 52220859.0, + "reward": 3.931131362915039, + "reward_std": 0.38958045840263367, + "rewards/reward_fn/mean": 3.931131362915039, + "rewards/reward_fn/std": 0.3895804286003113, + "step": 2247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 301.96875, + "completions/mean_terminated_length": 301.96875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.25961427416560806, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.010360757252783515, + "learning_rate": 3.5059999999999996e-06, + "loss": 0.0004, + "num_tokens": 52253466.0, + "reward": 3.6318325996398926, + "reward_std": 0.7878829836845398, + "rewards/reward_fn/mean": 3.6318325996398926, + "rewards/reward_fn/std": 0.787882924079895, + "step": 2248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 111.40625, + "completions/mean_terminated_length": 111.40625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.2597297609423721, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0439453125, + "kl": 0.006439149039579206, + "learning_rate": 3.504e-06, + "loss": 0.0003, + "num_tokens": 52269415.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 158.96875, + "completions/mean_terminated_length": 158.96875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.25984524771913614, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.013682861972483806, + "learning_rate": 3.5019999999999997e-06, + "loss": 0.0005, + "num_tokens": 52286758.0, + "reward": 3.971463203430176, + "reward_std": 0.16142767667770386, + "rewards/reward_fn/mean": 3.971463203430176, + "rewards/reward_fn/std": 0.16142766177654266, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 405.125, + "completions/mean_terminated_length": 405.125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.25996073449590024, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040283203125, + "kl": 0.009373388384119608, + "learning_rate": 3.5e-06, + "loss": 0.0004, + "num_tokens": 52309418.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 134.4375, + "completions/mean_terminated_length": 134.4375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.2600762212726643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.00839967506180983, + "learning_rate": 3.4980000000000002e-06, + "loss": 0.0003, + "num_tokens": 52328760.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 182.4375, + "completions/mean_terminated_length": 182.4375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.2601917080494283, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.017460784554714337, + "learning_rate": 3.4959999999999996e-06, + "loss": 0.0007, + "num_tokens": 52357030.0, + "reward": 3.9509549140930176, + "reward_std": 0.16055729985237122, + "rewards/reward_fn/mean": 3.9509549140930176, + "rewards/reward_fn/std": 0.16055729985237122, + "step": 2253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 176.96875, + "completions/mean_terminated_length": 176.96875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.2603071948261924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11376953125, + "kl": 0.016813859314424917, + "learning_rate": 3.494e-06, + "loss": 0.0007, + "num_tokens": 52385157.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 89.125, + "completions/mean_terminated_length": 89.125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.26042268160295645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056396484375, + "kl": 0.00519195059860067, + "learning_rate": 3.4919999999999998e-06, + "loss": 0.0002, + "num_tokens": 52405769.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 126.46875, + "completions/mean_terminated_length": 126.46875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.26053816837972055, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.010966183937853202, + "learning_rate": 3.49e-06, + "loss": 0.0004, + "num_tokens": 52420728.0, + "reward": 3.8594045639038086, + "reward_std": 0.2701056897640228, + "rewards/reward_fn/mean": 3.8594045639038086, + "rewards/reward_fn/std": 0.2701056897640228, + "step": 2256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 56.8125, + "completions/mean_terminated_length": 56.8125, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.2606536551564846, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.65625, + "kl": 0.012464509669371182, + "learning_rate": 3.488e-06, + "loss": 0.0005, + "num_tokens": 52437938.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 2257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 104.25, + "completions/mean_terminated_length": 104.25, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.2607691419332486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029296875, + "kl": 0.002308310165972216, + "learning_rate": 3.486e-06, + "loss": 0.0001, + "num_tokens": 52460442.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 161.09375, + "completions/mean_terminated_length": 161.09375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.2608846287100127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.011343520527589135, + "learning_rate": 3.484e-06, + "loss": 0.0005, + "num_tokens": 52490813.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 67.03125, + "completions/mean_terminated_length": 67.03125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.26100011548677676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.010319781576981768, + "learning_rate": 3.482e-06, + "loss": 0.0004, + "num_tokens": 52506590.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 305.40625, + "completions/mean_terminated_length": 305.40625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.2611156022635408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.011691552717820741, + "learning_rate": 3.4799999999999997e-06, + "loss": 0.0005, + "num_tokens": 52532299.0, + "reward": 2.889003276824951, + "reward_std": 0.3642488420009613, + "rewards/reward_fn/mean": 2.889003276824951, + "rewards/reward_fn/std": 0.3642488121986389, + "step": 2261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 97.96875, + "completions/mean_terminated_length": 97.96875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.2612310890403049, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.011047196101571899, + "learning_rate": 3.478e-06, + "loss": 0.0004, + "num_tokens": 52553546.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 132.21875, + "completions/mean_terminated_length": 132.21875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.26134657581706894, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.011976150432019494, + "learning_rate": 3.476e-06, + "loss": 0.0005, + "num_tokens": 52581937.0, + "reward": 3.106541633605957, + "reward_std": 0.1720605343580246, + "rewards/reward_fn/mean": 3.106541633605957, + "rewards/reward_fn/std": 0.1720605492591858, + "step": 2263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 193.59375, + "completions/mean_terminated_length": 193.59375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.26146206259383303, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.012876757347839884, + "learning_rate": 3.474e-06, + "loss": 0.0005, + "num_tokens": 52599172.0, + "reward": 3.8472375869750977, + "reward_std": 0.45894283056259155, + "rewards/reward_fn/mean": 3.8472375869750977, + "rewards/reward_fn/std": 0.45894286036491394, + "step": 2264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 155.53125, + "completions/mean_terminated_length": 155.53125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.2615775493705971, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049560546875, + "kl": 0.006425789433706086, + "learning_rate": 3.472e-06, + "loss": 0.0003, + "num_tokens": 52624309.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 114.15625, + "completions/mean_terminated_length": 114.15625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.2616930361473611, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.21875, + "kl": 0.011259111521212617, + "learning_rate": 3.4700000000000002e-06, + "loss": 0.0005, + "num_tokens": 52639834.0, + "reward": 3.9321281909942627, + "reward_std": 0.3839409649372101, + "rewards/reward_fn/mean": 3.9321281909942627, + "rewards/reward_fn/std": 0.3839409649372101, + "step": 2266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 64.5625, + "completions/mean_terminated_length": 64.5625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.2618085229241252, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.75, + "kl": 0.004788799296875368, + "learning_rate": 3.4679999999999997e-06, + "loss": 0.0002, + "num_tokens": 52665740.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 2267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 162.0, + "completions/mean_terminated_length": 162.0, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.26192400970088925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.01998834362893831, + "learning_rate": 3.466e-06, + "loss": 0.0008, + "num_tokens": 52691660.0, + "reward": 3.981876850128174, + "reward_std": 0.10252036154270172, + "rewards/reward_fn/mean": 3.981876850128174, + "rewards/reward_fn/std": 0.1025203987956047, + "step": 2268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 982.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 457.40625, + "completions/mean_terminated_length": 457.40625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.2620394964776533, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.022880467324284837, + "learning_rate": 3.4639999999999998e-06, + "loss": 0.0009, + "num_tokens": 52721241.0, + "reward": 3.930159568786621, + "reward_std": 0.3950764238834381, + "rewards/reward_fn/mean": 3.930159568786621, + "rewards/reward_fn/std": 0.39507636427879333, + "step": 2269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1073.0, + "completions/max_terminated_length": 1073.0, + "completions/mean_length": 366.21875, + "completions/mean_terminated_length": 366.21875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.2621549832544174, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.012070057200617157, + "learning_rate": 3.462e-06, + "loss": 0.0005, + "num_tokens": 52751968.0, + "reward": 3.9809327125549316, + "reward_std": 0.10786103457212448, + "rewards/reward_fn/mean": 3.9809327125549316, + "rewards/reward_fn/std": 0.10786107182502747, + "step": 2270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 206.34375, + "completions/mean_terminated_length": 206.34375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.2622704700311814, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.011202663510630373, + "learning_rate": 3.46e-06, + "loss": 0.0004, + "num_tokens": 52781195.0, + "reward": 3.9838576316833496, + "reward_std": 0.09131557494401932, + "rewards/reward_fn/mean": 3.9838576316833496, + "rewards/reward_fn/std": 0.09131557494401932, + "step": 2271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 146.90625, + "completions/mean_terminated_length": 146.90625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.26238595680794546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203125, + "kl": 0.010797307702887338, + "learning_rate": 3.458e-06, + "loss": 0.0004, + "num_tokens": 52803624.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 221.40625, + "completions/mean_terminated_length": 221.40625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.26250144358470956, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.010515707079321146, + "learning_rate": 3.4559999999999996e-06, + "loss": 0.0004, + "num_tokens": 52832533.0, + "reward": 3.6204562187194824, + "reward_std": 0.43082767724990845, + "rewards/reward_fn/mean": 3.6204562187194824, + "rewards/reward_fn/std": 0.43082767724990845, + "step": 2273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 378.875, + "completions/mean_terminated_length": 378.875, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.2626169303614736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033203125, + "kl": 0.008423585379205178, + "learning_rate": 3.454e-06, + "loss": 0.0003, + "num_tokens": 52859857.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 319.75, + "completions/mean_terminated_length": 319.75, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.2627324171382377, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.01237796837813221, + "learning_rate": 3.4519999999999997e-06, + "loss": 0.0005, + "num_tokens": 52885097.0, + "reward": 3.7593209743499756, + "reward_std": 0.4247325360774994, + "rewards/reward_fn/mean": 3.7593209743499756, + "rewards/reward_fn/std": 0.424732506275177, + "step": 2275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 236.5, + "completions/mean_terminated_length": 236.5, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.26284790391500173, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.015001280378783122, + "learning_rate": 3.45e-06, + "loss": 0.0006, + "num_tokens": 52906425.0, + "reward": 3.967637062072754, + "reward_std": 0.18307293951511383, + "rewards/reward_fn/mean": 3.967637062072754, + "rewards/reward_fn/std": 0.18307293951511383, + "step": 2276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 140.09375, + "completions/mean_terminated_length": 140.09375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.2629633906917658, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.005685918764356757, + "learning_rate": 3.448e-06, + "loss": 0.0002, + "num_tokens": 52933244.0, + "reward": 3.618168354034424, + "reward_std": 0.02311564050614834, + "rewards/reward_fn/mean": 3.618168354034424, + "rewards/reward_fn/std": 0.02311566099524498, + "step": 2277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 72.78125, + "completions/mean_terminated_length": 72.78125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.26307887746852987, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.006273297061852645, + "learning_rate": 3.446e-06, + "loss": 0.0003, + "num_tokens": 52962325.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 223.96875, + "completions/mean_terminated_length": 223.96875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.2631943642452939, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.00858627377601806, + "learning_rate": 3.444e-06, + "loss": 0.0003, + "num_tokens": 52985364.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 192.1875, + "completions/mean_terminated_length": 192.1875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.26330985102205795, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.008591179823270068, + "learning_rate": 3.442e-06, + "loss": 0.0003, + "num_tokens": 53009882.0, + "reward": 2.7879021167755127, + "reward_std": 0.21321041882038116, + "rewards/reward_fn/mean": 2.7879021167755127, + "rewards/reward_fn/std": 0.21321037411689758, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 322.4375, + "completions/mean_terminated_length": 322.4375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.26342533779882205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.01581183598318603, + "learning_rate": 3.4399999999999997e-06, + "loss": 0.0006, + "num_tokens": 53031784.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 117.375, + "completions/mean_terminated_length": 117.375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.2635408245755861, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.00660376330051804, + "learning_rate": 3.438e-06, + "loss": 0.0003, + "num_tokens": 53058772.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 156.3125, + "completions/mean_terminated_length": 156.3125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.2636563113523502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.011573249794309959, + "learning_rate": 3.436e-06, + "loss": 0.0005, + "num_tokens": 53084574.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 347.125, + "completions/mean_terminated_length": 347.125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.2637717981291142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.010277443056111224, + "learning_rate": 3.434e-06, + "loss": 0.0004, + "num_tokens": 53118946.0, + "reward": 3.9699487686157227, + "reward_std": 0.1699962466955185, + "rewards/reward_fn/mean": 3.9699487686157227, + "rewards/reward_fn/std": 0.1699962317943573, + "step": 2284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 161.78125, + "completions/mean_terminated_length": 161.78125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.26388728490587826, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.328125, + "kl": 0.014758660268853419, + "learning_rate": 3.432e-06, + "loss": 0.0006, + "num_tokens": 53131803.0, + "reward": 3.355914831161499, + "reward_std": 0.10054007917642593, + "rewards/reward_fn/mean": 3.355914831161499, + "rewards/reward_fn/std": 0.10054005682468414, + "step": 2285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 60.875, + "completions/mean_terminated_length": 60.875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.26400277168264236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.004739995045383694, + "learning_rate": 3.43e-06, + "loss": 0.0002, + "num_tokens": 53153847.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 229.5625, + "completions/mean_terminated_length": 229.5625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.2641182584594064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.010941828848444857, + "learning_rate": 3.4279999999999996e-06, + "loss": 0.0004, + "num_tokens": 53173577.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 94.78125, + "completions/mean_terminated_length": 94.78125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.26423374523617044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10107421875, + "kl": 0.012071424607711378, + "learning_rate": 3.426e-06, + "loss": 0.0005, + "num_tokens": 53188514.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 139.03125, + "completions/mean_terminated_length": 139.03125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.26434923201293453, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "kl": 0.0121972766937688, + "learning_rate": 3.4239999999999997e-06, + "loss": 0.0005, + "num_tokens": 53200963.0, + "reward": 3.8664815425872803, + "reward_std": 0.3605958819389343, + "rewards/reward_fn/mean": 3.8664815425872803, + "rewards/reward_fn/std": 0.36059585213661194, + "step": 2289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 106.53125, + "completions/mean_terminated_length": 106.53125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.26446471878969857, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.010677036552806385, + "learning_rate": 3.422e-06, + "loss": 0.0004, + "num_tokens": 53229268.0, + "reward": 3.8633413314819336, + "reward_std": 0.5378085374832153, + "rewards/reward_fn/mean": 3.8633413314819336, + "rewards/reward_fn/std": 0.5378084778785706, + "step": 2290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 214.625, + "completions/mean_terminated_length": 214.625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.26458020556646267, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.020335509645519778, + "learning_rate": 3.42e-06, + "loss": 0.0008, + "num_tokens": 53258376.0, + "reward": 3.621739387512207, + "reward_std": 0.6711610555648804, + "rewards/reward_fn/mean": 3.621739387512207, + "rewards/reward_fn/std": 0.6711610555648804, + "step": 2291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 155.28125, + "completions/mean_terminated_length": 155.28125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.2646956923432267, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.010039814391348045, + "learning_rate": 3.418e-06, + "loss": 0.0004, + "num_tokens": 53284433.0, + "reward": 3.8835017681121826, + "reward_std": 0.4209463596343994, + "rewards/reward_fn/mean": 3.8835017681121826, + "rewards/reward_fn/std": 0.420946329832077, + "step": 2292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 108.6875, + "completions/mean_terminated_length": 108.6875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.26481117911999075, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.010100948209583294, + "learning_rate": 3.4159999999999996e-06, + "loss": 0.0004, + "num_tokens": 53310407.0, + "reward": 3.97750186920166, + "reward_std": 0.127269446849823, + "rewards/reward_fn/mean": 3.97750186920166, + "rewards/reward_fn/std": 0.127269446849823, + "step": 2293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 145.15625, + "completions/mean_terminated_length": 145.15625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.26492666589675484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.013935007315012626, + "learning_rate": 3.414e-06, + "loss": 0.0006, + "num_tokens": 53338220.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 83.1875, + "completions/mean_terminated_length": 83.1875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.2650421526735189, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043212890625, + "kl": 0.004215418943203986, + "learning_rate": 3.4119999999999997e-06, + "loss": 0.0002, + "num_tokens": 53353106.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 174.65625, + "completions/mean_terminated_length": 174.65625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.2651576394502829, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.01156386912043672, + "learning_rate": 3.41e-06, + "loss": 0.0005, + "num_tokens": 53374951.0, + "reward": 3.89572811126709, + "reward_std": 0.33241209387779236, + "rewards/reward_fn/mean": 3.89572811126709, + "rewards/reward_fn/std": 0.33241209387779236, + "step": 2296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 230.0625, + "completions/mean_terminated_length": 230.0625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.265273126227047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.007707302112976322, + "learning_rate": 3.408e-06, + "loss": 0.0003, + "num_tokens": 53393929.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 327.40625, + "completions/mean_terminated_length": 327.40625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.26538861300381106, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.013696096168132499, + "learning_rate": 3.406e-06, + "loss": 0.0005, + "num_tokens": 53425494.0, + "reward": 2.7720274925231934, + "reward_std": 0.9512789249420166, + "rewards/reward_fn/mean": 2.7720274925231934, + "rewards/reward_fn/std": 0.9512788653373718, + "step": 2298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 254.8125, + "completions/mean_terminated_length": 254.8125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.2655040997805751, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.013096182665321976, + "learning_rate": 3.404e-06, + "loss": 0.0005, + "num_tokens": 53444816.0, + "reward": 3.9303181171417236, + "reward_std": 0.3941800892353058, + "rewards/reward_fn/mean": 3.9303181171417236, + "rewards/reward_fn/std": 0.39418014883995056, + "step": 2299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 103.0, + "completions/max_terminated_length": 103.0, + "completions/mean_length": 68.71875, + "completions/mean_terminated_length": 68.71875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.2656195865573392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.004219312097120564, + "learning_rate": 3.4019999999999998e-06, + "loss": 0.0002, + "num_tokens": 53468071.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 139.34375, + "completions/mean_terminated_length": 139.34375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.26573507333410323, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "kl": 0.010271752908010967, + "learning_rate": 3.3999999999999996e-06, + "loss": 0.0004, + "num_tokens": 53496114.0, + "reward": 3.5894956588745117, + "reward_std": 0.5056686997413635, + "rewards/reward_fn/mean": 3.5894956588745117, + "rewards/reward_fn/std": 0.5056686997413635, + "step": 2301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 79.6875, + "completions/mean_terminated_length": 79.6875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.26585056011086733, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.011304166975605767, + "learning_rate": 3.398e-06, + "loss": 0.0005, + "num_tokens": 53523400.0, + "reward": 3.9758846759796143, + "reward_std": 0.13641680777072906, + "rewards/reward_fn/mean": 3.9758846759796143, + "rewards/reward_fn/std": 0.13641679286956787, + "step": 2302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 165.0, + "completions/mean_terminated_length": 165.0, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.26596604688763137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057861328125, + "kl": 0.011247144604567438, + "learning_rate": 3.3959999999999998e-06, + "loss": 0.0004, + "num_tokens": 53552104.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 111.625, + "completions/mean_terminated_length": 111.625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.2660815336643954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032470703125, + "kl": 0.0028473109887272585, + "learning_rate": 3.394e-06, + "loss": 0.0001, + "num_tokens": 53569788.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 152.96875, + "completions/mean_terminated_length": 152.96875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.2661970204411595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.013226138049503788, + "learning_rate": 3.392e-06, + "loss": 0.0005, + "num_tokens": 53597915.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 135.875, + "completions/mean_terminated_length": 135.875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.26631250721792354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.00970672930998262, + "learning_rate": 3.39e-06, + "loss": 0.0004, + "num_tokens": 53623479.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 181.8125, + "completions/mean_terminated_length": 181.8125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.2664279939946876, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.02028043702011928, + "learning_rate": 3.3879999999999996e-06, + "loss": 0.0008, + "num_tokens": 53645617.0, + "reward": 3.254685163497925, + "reward_std": 0.40884163975715637, + "rewards/reward_fn/mean": 3.254685163497925, + "rewards/reward_fn/std": 0.40884166955947876, + "step": 2307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 69.53125, + "completions/mean_terminated_length": 69.53125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.2665434807714517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.00522729263502697, + "learning_rate": 3.386e-06, + "loss": 0.0002, + "num_tokens": 53661282.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 166.53125, + "completions/mean_terminated_length": 166.53125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.2666589675482157, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.020719645486678928, + "learning_rate": 3.3839999999999997e-06, + "loss": 0.0008, + "num_tokens": 53684851.0, + "reward": 3.8871958255767822, + "reward_std": 0.4162246882915497, + "rewards/reward_fn/mean": 3.8871958255767822, + "rewards/reward_fn/std": 0.4162246882915497, + "step": 2309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 174.6875, + "completions/mean_terminated_length": 174.6875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.2667744543249798, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.008590045348682906, + "learning_rate": 3.382e-06, + "loss": 0.0003, + "num_tokens": 53718217.0, + "reward": 3.932204484939575, + "reward_std": 0.38350921869277954, + "rewards/reward_fn/mean": 3.932204484939575, + "rewards/reward_fn/std": 0.38350921869277954, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 189.375, + "completions/mean_terminated_length": 189.375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.26688994110174385, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037109375, + "kl": 0.006234205306100193, + "learning_rate": 3.38e-06, + "loss": 0.0002, + "num_tokens": 53742293.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 225.8125, + "completions/mean_terminated_length": 225.8125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.2670054278785079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.015398453382658772, + "learning_rate": 3.378e-06, + "loss": 0.0006, + "num_tokens": 53767407.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 60.4375, + "completions/mean_terminated_length": 60.4375, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.267120914655272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.007985376945725875, + "learning_rate": 3.3759999999999995e-06, + "loss": 0.0003, + "num_tokens": 53779485.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 128.0, + "completions/mean_terminated_length": 128.0, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.26723640143203603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05029296875, + "kl": 0.006406890435755486, + "learning_rate": 3.374e-06, + "loss": 0.0003, + "num_tokens": 53807453.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 133.375, + "completions/mean_terminated_length": 133.375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.26735188820880007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.016456608034786768, + "learning_rate": 3.3719999999999996e-06, + "loss": 0.0007, + "num_tokens": 53830953.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 282.84375, + "completions/mean_terminated_length": 282.84375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.26746737498556417, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050048828125, + "kl": 0.01016691530821845, + "learning_rate": 3.37e-06, + "loss": 0.0004, + "num_tokens": 53854948.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 266.25, + "completions/mean_terminated_length": 266.25, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.2675828617623282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.010328619136998896, + "learning_rate": 3.3679999999999998e-06, + "loss": 0.0004, + "num_tokens": 53877196.0, + "reward": 2.6963610649108887, + "reward_std": 0.028353827074170113, + "rewards/reward_fn/mean": 2.6963610649108887, + "rewards/reward_fn/std": 0.028353817760944366, + "step": 2317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 211.1875, + "completions/mean_terminated_length": 211.1875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.2676983485390923, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.01336886634817347, + "learning_rate": 3.366e-06, + "loss": 0.0005, + "num_tokens": 53904690.0, + "reward": 3.974175453186035, + "reward_std": 0.14608624577522278, + "rewards/reward_fn/mean": 3.974175453186035, + "rewards/reward_fn/std": 0.14608624577522278, + "step": 2318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 238.78125, + "completions/mean_terminated_length": 238.78125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.26781383531585634, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.01175308354140725, + "learning_rate": 3.364e-06, + "loss": 0.0005, + "num_tokens": 53934763.0, + "reward": 3.1918816566467285, + "reward_std": 0.12709984183311462, + "rewards/reward_fn/mean": 3.1918816566467285, + "rewards/reward_fn/std": 0.12709984183311462, + "step": 2319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 268.96875, + "completions/mean_terminated_length": 268.96875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.2679293220926204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.013603473802504595, + "learning_rate": 3.3619999999999997e-06, + "loss": 0.0005, + "num_tokens": 53958250.0, + "reward": 3.933542490005493, + "reward_std": 0.37594079971313477, + "rewards/reward_fn/mean": 3.933542490005493, + "rewards/reward_fn/std": 0.37594079971313477, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 95.0, + "completions/mean_terminated_length": 95.0, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.2680448088693845, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0625, + "kl": 0.0145646950986702, + "learning_rate": 3.3599999999999996e-06, + "loss": 0.0006, + "num_tokens": 53983082.0, + "reward": 3.969348669052124, + "reward_std": 0.17339010536670685, + "rewards/reward_fn/mean": 3.969348669052124, + "rewards/reward_fn/std": 0.17339010536670685, + "step": 2321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 182.4375, + "completions/mean_terminated_length": 182.4375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.2681602956461485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.019216304150177166, + "learning_rate": 3.358e-06, + "loss": 0.0008, + "num_tokens": 54012664.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 68.71875, + "completions/mean_terminated_length": 68.71875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.26827578242291256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.005218111618887633, + "learning_rate": 3.3559999999999997e-06, + "loss": 0.0002, + "num_tokens": 54026127.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 399.03125, + "completions/mean_terminated_length": 399.03125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.26839126919967665, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.012154204421676695, + "learning_rate": 3.354e-06, + "loss": 0.0005, + "num_tokens": 54061488.0, + "reward": 3.419069290161133, + "reward_std": 0.5243944525718689, + "rewards/reward_fn/mean": 3.419069290161133, + "rewards/reward_fn/std": 0.5243944525718689, + "step": 2324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 285.90625, + "completions/mean_terminated_length": 285.90625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.2685067559764407, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.010233825829345733, + "learning_rate": 3.352e-06, + "loss": 0.0004, + "num_tokens": 54082797.0, + "reward": 3.7166473865509033, + "reward_std": 0.7617855072021484, + "rewards/reward_fn/mean": 3.7166473865509033, + "rewards/reward_fn/std": 0.7617854475975037, + "step": 2325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 174.15625, + "completions/mean_terminated_length": 174.15625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.26862224275320473, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.020922835698002018, + "learning_rate": 3.35e-06, + "loss": 0.0008, + "num_tokens": 54097138.0, + "reward": 3.290692090988159, + "reward_std": 0.3270396292209625, + "rewards/reward_fn/mean": 3.290692090988159, + "rewards/reward_fn/std": 0.32703959941864014, + "step": 2326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 130.15625, + "completions/mean_terminated_length": 130.15625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.2687377295299688, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.012530896026873961, + "learning_rate": 3.3479999999999995e-06, + "loss": 0.0005, + "num_tokens": 54122711.0, + "reward": 3.946690559387207, + "reward_std": 0.20982566475868225, + "rewards/reward_fn/mean": 3.946690559387207, + "rewards/reward_fn/std": 0.20982560515403748, + "step": 2327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 150.25, + "completions/mean_terminated_length": 150.25, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.26885321630673287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.007292170856089797, + "learning_rate": 3.346e-06, + "loss": 0.0003, + "num_tokens": 54144159.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 187.84375, + "completions/mean_terminated_length": 187.84375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.26896870308349696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.00694198968631099, + "learning_rate": 3.3439999999999997e-06, + "loss": 0.0003, + "num_tokens": 54163354.0, + "reward": 3.961745262145996, + "reward_std": 0.21640081703662872, + "rewards/reward_fn/mean": 3.961745262145996, + "rewards/reward_fn/std": 0.21640083193778992, + "step": 2329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 267.4375, + "completions/mean_terminated_length": 267.4375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.269084189860261, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.006874401311506517, + "learning_rate": 3.342e-06, + "loss": 0.0003, + "num_tokens": 54191368.0, + "reward": 3.650200605392456, + "reward_std": 0.38354185223579407, + "rewards/reward_fn/mean": 3.650200605392456, + "rewards/reward_fn/std": 0.3835418224334717, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 76.59375, + "completions/mean_terminated_length": 76.59375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.26919967663702504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.004291855024348479, + "learning_rate": 3.3399999999999998e-06, + "loss": 0.0002, + "num_tokens": 54214779.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 133.71875, + "completions/mean_terminated_length": 133.71875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.26931516341378914, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.013461743023071904, + "learning_rate": 3.338e-06, + "loss": 0.0005, + "num_tokens": 54232690.0, + "reward": 3.862675905227661, + "reward_std": 0.54038006067276, + "rewards/reward_fn/mean": 3.862675905227661, + "rewards/reward_fn/std": 0.5403801202774048, + "step": 2332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 214.875, + "completions/mean_terminated_length": 214.875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.2694306501905532, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.00902456929179607, + "learning_rate": 3.336e-06, + "loss": 0.0004, + "num_tokens": 54253006.0, + "reward": 3.931642532348633, + "reward_std": 0.2195800095796585, + "rewards/reward_fn/mean": 3.931642532348633, + "rewards/reward_fn/std": 0.2195800095796585, + "step": 2333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 80.09375, + "completions/mean_terminated_length": 80.09375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.2695461369673172, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.013205913783167489, + "learning_rate": 3.3339999999999998e-06, + "loss": 0.0005, + "num_tokens": 54267345.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 132.96875, + "completions/mean_terminated_length": 132.96875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.2696616237440813, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.03125, + "kl": 0.016828514213557355, + "learning_rate": 3.3319999999999996e-06, + "loss": 0.0007, + "num_tokens": 54285392.0, + "reward": 3.81170654296875, + "reward_std": 0.332327276468277, + "rewards/reward_fn/mean": 3.81170654296875, + "rewards/reward_fn/std": 0.3323272168636322, + "step": 2335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 94.53125, + "completions/mean_terminated_length": 94.53125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.26977711052084535, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.007750724111247109, + "learning_rate": 3.33e-06, + "loss": 0.0003, + "num_tokens": 54300257.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 216.125, + "completions/mean_terminated_length": 216.125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.26989259729760945, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.012308136705541983, + "learning_rate": 3.3279999999999997e-06, + "loss": 0.0005, + "num_tokens": 54323557.0, + "reward": 3.5872833728790283, + "reward_std": 0.6865459084510803, + "rewards/reward_fn/mean": 3.5872833728790283, + "rewards/reward_fn/std": 0.6865459084510803, + "step": 2337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 177.1875, + "completions/mean_terminated_length": 177.1875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.2700080840743735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.01004307658149628, + "learning_rate": 3.326e-06, + "loss": 0.0004, + "num_tokens": 54340363.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 101.34375, + "completions/mean_terminated_length": 101.34375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.27012357085113753, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.328125, + "kl": 0.011568154412088916, + "learning_rate": 3.324e-06, + "loss": 0.0005, + "num_tokens": 54365814.0, + "reward": 3.7525529861450195, + "reward_std": 0.4021587073802948, + "rewards/reward_fn/mean": 3.7525529861450195, + "rewards/reward_fn/std": 0.4021587073802948, + "step": 2339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 121.375, + "completions/mean_terminated_length": 121.375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.2702390576279016, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.0074567210031091236, + "learning_rate": 3.3219999999999997e-06, + "loss": 0.0003, + "num_tokens": 54381442.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 106.4375, + "completions/mean_terminated_length": 106.4375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.27035454440466566, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "kl": 0.016971111108432524, + "learning_rate": 3.3199999999999996e-06, + "loss": 0.0007, + "num_tokens": 54400752.0, + "reward": 3.128448486328125, + "reward_std": 0.0843619629740715, + "rewards/reward_fn/mean": 3.128448486328125, + "rewards/reward_fn/std": 0.08436199277639389, + "step": 2341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 331.1875, + "completions/mean_terminated_length": 331.1875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.2704700311814297, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.010752130881883204, + "learning_rate": 3.318e-06, + "loss": 0.0004, + "num_tokens": 54425014.0, + "reward": 3.700725555419922, + "reward_std": 0.42316514253616333, + "rewards/reward_fn/mean": 3.700725555419922, + "rewards/reward_fn/std": 0.4231651723384857, + "step": 2342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 199.90625, + "completions/mean_terminated_length": 199.90625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.2705855179581938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.008974609649158083, + "learning_rate": 3.3159999999999997e-06, + "loss": 0.0004, + "num_tokens": 54449971.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 291.53125, + "completions/mean_terminated_length": 291.53125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.27070100473495784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.010790880478452891, + "learning_rate": 3.314e-06, + "loss": 0.0004, + "num_tokens": 54473956.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 565.1875, + "completions/mean_terminated_length": 517.3547973632812, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.27081649151172194, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.016775483134551905, + "learning_rate": 3.312e-06, + "loss": 0.0007, + "num_tokens": 54503562.0, + "reward": 3.5089476108551025, + "reward_std": 1.0735656023025513, + "rewards/reward_fn/mean": 3.5089476108551025, + "rewards/reward_fn/std": 1.0735656023025513, + "step": 2345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 54.5, + "completions/mean_terminated_length": 54.5, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.270931978288486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.005453256373584736, + "learning_rate": 3.31e-06, + "loss": 0.0002, + "num_tokens": 54518618.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 306.96875, + "completions/mean_terminated_length": 306.96875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.27104746506525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0390625, + "kl": 0.008217165188398212, + "learning_rate": 3.3079999999999995e-06, + "loss": 0.0003, + "num_tokens": 54538041.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 80.0625, + "completions/mean_terminated_length": 80.0625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.2711629518420141, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.008427005384874064, + "learning_rate": 3.3059999999999998e-06, + "loss": 0.0003, + "num_tokens": 54566363.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 57.5625, + "completions/mean_terminated_length": 57.5625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.27127843861877815, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.484375, + "kl": 0.017779105066438206, + "learning_rate": 3.3039999999999996e-06, + "loss": 0.0007, + "num_tokens": 54582125.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 2349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 60.28125, + "completions/mean_terminated_length": 60.28125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.2713939253955422, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.004485296882194234, + "learning_rate": 3.302e-06, + "loss": 0.0002, + "num_tokens": 54608310.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 180.8125, + "completions/mean_terminated_length": 180.8125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.2715094121723063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.02060974793857895, + "learning_rate": 3.2999999999999997e-06, + "loss": 0.0008, + "num_tokens": 54626800.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 170.90625, + "completions/mean_terminated_length": 170.90625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.2716248989490703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0546875, + "kl": 0.008208945466321893, + "learning_rate": 3.298e-06, + "loss": 0.0003, + "num_tokens": 54654861.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 132.40625, + "completions/mean_terminated_length": 132.40625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.27174038572583437, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.006418628872779664, + "learning_rate": 3.296e-06, + "loss": 0.0003, + "num_tokens": 54689530.0, + "reward": 3.967902660369873, + "reward_std": 0.10182101279497147, + "rewards/reward_fn/mean": 3.967902660369873, + "rewards/reward_fn/std": 0.10182097554206848, + "step": 2353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 80.90625, + "completions/mean_terminated_length": 80.90625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.27185587250259846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.004345435689174337, + "learning_rate": 3.2939999999999997e-06, + "loss": 0.0002, + "num_tokens": 54715159.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 180.21875, + "completions/mean_terminated_length": 180.21875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.2719713592793625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.009832504205405712, + "learning_rate": 3.2919999999999996e-06, + "loss": 0.0004, + "num_tokens": 54745758.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 169.40625, + "completions/mean_terminated_length": 169.40625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.2720868460561266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.01571451719792094, + "learning_rate": 3.29e-06, + "loss": 0.0006, + "num_tokens": 54764843.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 356.96875, + "completions/mean_terminated_length": 356.96875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.27220233283289064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045166015625, + "kl": 0.008810530795017257, + "learning_rate": 3.2879999999999997e-06, + "loss": 0.0004, + "num_tokens": 54791882.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 110.09375, + "completions/mean_terminated_length": 110.09375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.2723178196096547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.005141771041962784, + "learning_rate": 3.286e-06, + "loss": 0.0002, + "num_tokens": 54812941.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 318.28125, + "completions/mean_terminated_length": 318.28125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.27243330638641877, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.01973902124154847, + "learning_rate": 3.284e-06, + "loss": 0.0008, + "num_tokens": 54836310.0, + "reward": 3.715700149536133, + "reward_std": 0.7644455432891846, + "rewards/reward_fn/mean": 3.715700149536133, + "rewards/reward_fn/std": 0.7644455432891846, + "step": 2359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 191.78125, + "completions/mean_terminated_length": 191.78125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.2725487931631828, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.012899803812615573, + "learning_rate": 3.282e-06, + "loss": 0.0005, + "num_tokens": 54862415.0, + "reward": 3.0139126777648926, + "reward_std": 0.19849829375743866, + "rewards/reward_fn/mean": 3.0139126777648926, + "rewards/reward_fn/std": 0.19849830865859985, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 333.75, + "completions/mean_terminated_length": 333.75, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.27266427993994685, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.013979034658404998, + "learning_rate": 3.2799999999999995e-06, + "loss": 0.0006, + "num_tokens": 54895399.0, + "reward": 3.5518341064453125, + "reward_std": 0.5504558086395264, + "rewards/reward_fn/mean": 3.5518341064453125, + "rewards/reward_fn/std": 0.5504558086395264, + "step": 2361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 217.65625, + "completions/mean_terminated_length": 217.65625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.27277976671671095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.010483277692401316, + "learning_rate": 3.2779999999999998e-06, + "loss": 0.0004, + "num_tokens": 54913244.0, + "reward": 3.835783004760742, + "reward_std": 0.5541423559188843, + "rewards/reward_fn/mean": 3.835783004760742, + "rewards/reward_fn/std": 0.5541423559188843, + "step": 2362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 210.9375, + "completions/mean_terminated_length": 210.9375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.272895253493475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.008849376506987028, + "learning_rate": 3.2759999999999996e-06, + "loss": 0.0004, + "num_tokens": 54942010.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 70.4375, + "completions/mean_terminated_length": 70.4375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.2730107402702391, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.0072915838318294846, + "learning_rate": 3.274e-06, + "loss": 0.0003, + "num_tokens": 54958984.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 210.75, + "completions/mean_terminated_length": 210.75, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.2731262270470031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.00984570498985704, + "learning_rate": 3.2719999999999998e-06, + "loss": 0.0004, + "num_tokens": 54977664.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 101.96875, + "completions/mean_terminated_length": 101.96875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.27324171382376716, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.014191870126524009, + "learning_rate": 3.27e-06, + "loss": 0.0006, + "num_tokens": 55002687.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 76.71875, + "completions/mean_terminated_length": 76.71875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.27335720060053126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.009743455208081286, + "learning_rate": 3.2679999999999995e-06, + "loss": 0.0004, + "num_tokens": 55017686.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1044.0, + "completions/max_terminated_length": 1044.0, + "completions/mean_length": 374.625, + "completions/mean_terminated_length": 374.625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.2734726873772953, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.01129811150894966, + "learning_rate": 3.2659999999999997e-06, + "loss": 0.0005, + "num_tokens": 55048106.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 80.1875, + "completions/mean_terminated_length": 80.1875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.27358817415405934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.006622630658966955, + "learning_rate": 3.2639999999999996e-06, + "loss": 0.0003, + "num_tokens": 55063056.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 346.59375, + "completions/mean_terminated_length": 346.59375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.27370366093082343, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.009530559218546841, + "learning_rate": 3.262e-06, + "loss": 0.0004, + "num_tokens": 55089219.0, + "reward": 3.9276881217956543, + "reward_std": 0.4090578258037567, + "rewards/reward_fn/mean": 3.9276881217956543, + "rewards/reward_fn/std": 0.4090578258037567, + "step": 2370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 180.1875, + "completions/mean_terminated_length": 180.1875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.2738191477075875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041259765625, + "kl": 0.007474729347450193, + "learning_rate": 3.2599999999999997e-06, + "loss": 0.0003, + "num_tokens": 55117033.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 246.375, + "completions/mean_terminated_length": 246.375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.27393463448435157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.009962531134078745, + "learning_rate": 3.258e-06, + "loss": 0.0004, + "num_tokens": 55140597.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 182.65625, + "completions/mean_terminated_length": 182.65625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.2740501212611156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.007834488249500282, + "learning_rate": 3.256e-06, + "loss": 0.0003, + "num_tokens": 55163018.0, + "reward": 3.2781567573547363, + "reward_std": 0.33531805872917175, + "rewards/reward_fn/mean": 3.2781567573547363, + "rewards/reward_fn/std": 0.33531805872917175, + "step": 2373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 176.0625, + "completions/mean_terminated_length": 176.0625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.27416560803787965, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.023850684097851627, + "learning_rate": 3.2539999999999997e-06, + "loss": 0.001, + "num_tokens": 55186668.0, + "reward": 3.731735944747925, + "reward_std": 0.3851914703845978, + "rewards/reward_fn/mean": 3.731735944747925, + "rewards/reward_fn/std": 0.3851914405822754, + "step": 2374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 209.78125, + "completions/mean_terminated_length": 209.78125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.27428109481464374, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.014704959132359363, + "learning_rate": 3.2519999999999995e-06, + "loss": 0.0006, + "num_tokens": 55209253.0, + "reward": 3.9093761444091797, + "reward_std": 0.4138210117816925, + "rewards/reward_fn/mean": 3.9093761444091797, + "rewards/reward_fn/std": 0.4138210117816925, + "step": 2375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 327.03125, + "completions/mean_terminated_length": 327.03125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.2743965815914078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054931640625, + "kl": 0.009441630878427532, + "learning_rate": 3.25e-06, + "loss": 0.0004, + "num_tokens": 55229510.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 131.75, + "completions/mean_terminated_length": 131.75, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.2745120683681718, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.006778864280931884, + "learning_rate": 3.248e-06, + "loss": 0.0003, + "num_tokens": 55262942.0, + "reward": 3.9761710166931152, + "reward_std": 0.1347978264093399, + "rewards/reward_fn/mean": 3.9761710166931152, + "rewards/reward_fn/std": 0.1347978413105011, + "step": 2377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 103.25, + "completions/mean_terminated_length": 103.25, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.2746275551449359, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.009520940584479831, + "learning_rate": 3.246e-06, + "loss": 0.0004, + "num_tokens": 55284230.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1119.0, + "completions/max_terminated_length": 1119.0, + "completions/mean_length": 300.9375, + "completions/mean_terminated_length": 300.9375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.27474304192169996, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.012493534886743873, + "learning_rate": 3.244e-06, + "loss": 0.0005, + "num_tokens": 55302116.0, + "reward": 3.6833600997924805, + "reward_std": 0.6301649808883667, + "rewards/reward_fn/mean": 3.6833600997924805, + "rewards/reward_fn/std": 0.6301649808883667, + "step": 2379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 91.4375, + "completions/mean_terminated_length": 91.4375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.274858528698464, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.109375, + "kl": 0.01656111162446905, + "learning_rate": 3.242e-06, + "loss": 0.0007, + "num_tokens": 55323378.0, + "reward": 3.9643023014068604, + "reward_std": 0.20193707942962646, + "rewards/reward_fn/mean": 3.9643023014068604, + "rewards/reward_fn/std": 0.20193709433078766, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 68.0, + "completions/mean_terminated_length": 68.0, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.2749740154752281, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.007899233562056907, + "learning_rate": 3.24e-06, + "loss": 0.0003, + "num_tokens": 55341266.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 219.5625, + "completions/mean_terminated_length": 219.5625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.27508950225199214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.011986421683104709, + "learning_rate": 3.2379999999999997e-06, + "loss": 0.0005, + "num_tokens": 55356164.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 213.21875, + "completions/mean_terminated_length": 213.21875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.27520498902875623, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.008757865100051276, + "learning_rate": 3.236e-06, + "loss": 0.0004, + "num_tokens": 55385835.0, + "reward": 3.855757236480713, + "reward_std": 0.5676257014274597, + "rewards/reward_fn/mean": 3.855757236480713, + "rewards/reward_fn/std": 0.5676257014274597, + "step": 2383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 99.9375, + "completions/mean_terminated_length": 99.9375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.27532047580552027, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.007563063940324355, + "learning_rate": 3.234e-06, + "loss": 0.0003, + "num_tokens": 55412137.0, + "reward": 3.9462904930114746, + "reward_std": 0.2161838561296463, + "rewards/reward_fn/mean": 3.9462904930114746, + "rewards/reward_fn/std": 0.2161838263273239, + "step": 2384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 93.0, + "completions/max_terminated_length": 93.0, + "completions/mean_length": 62.1875, + "completions/mean_terminated_length": 62.1875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.2754359625822843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1650390625, + "kl": 0.010346232265874278, + "learning_rate": 3.232e-06, + "loss": 0.0004, + "num_tokens": 55430735.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 111.25, + "completions/mean_terminated_length": 111.25, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.2755514493590484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050048828125, + "kl": 0.004482758062295034, + "learning_rate": 3.23e-06, + "loss": 0.0002, + "num_tokens": 55460247.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 192.5625, + "completions/mean_terminated_length": 192.5625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.27566693613581245, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "kl": 0.00849744297011057, + "learning_rate": 3.2280000000000003e-06, + "loss": 0.0003, + "num_tokens": 55493993.0, + "reward": 3.9668803215026855, + "reward_std": 0.1873537003993988, + "rewards/reward_fn/mean": 3.9668803215026855, + "rewards/reward_fn/std": 0.18735367059707642, + "step": 2387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 212.5625, + "completions/mean_terminated_length": 212.5625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.2757824229125765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.010807566686708014, + "learning_rate": 3.2259999999999997e-06, + "loss": 0.0004, + "num_tokens": 55526491.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 114.90625, + "completions/mean_terminated_length": 114.90625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.2758979096893406, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.010782465091324411, + "learning_rate": 3.224e-06, + "loss": 0.0004, + "num_tokens": 55542936.0, + "reward": 3.9777896404266357, + "reward_std": 0.1256408393383026, + "rewards/reward_fn/mean": 3.9777896404266357, + "rewards/reward_fn/std": 0.1256408542394638, + "step": 2389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 190.15625, + "completions/mean_terminated_length": 190.15625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.2760133964661046, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.011879202575073577, + "learning_rate": 3.222e-06, + "loss": 0.0005, + "num_tokens": 55569213.0, + "reward": 3.8323988914489746, + "reward_std": 0.4915595054626465, + "rewards/reward_fn/mean": 3.8323988914489746, + "rewards/reward_fn/std": 0.4915595054626465, + "step": 2390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 103.28125, + "completions/mean_terminated_length": 103.28125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.2761288832428687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.00544250886014197, + "learning_rate": 3.22e-06, + "loss": 0.0002, + "num_tokens": 55585382.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 108.0625, + "completions/mean_terminated_length": 108.0625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.27624437001963276, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.006494753457445768, + "learning_rate": 3.218e-06, + "loss": 0.0003, + "num_tokens": 55613672.0, + "reward": 3.940217971801758, + "reward_std": 0.19225835800170898, + "rewards/reward_fn/mean": 3.940217971801758, + "rewards/reward_fn/std": 0.19225835800170898, + "step": 2392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 294.84375, + "completions/mean_terminated_length": 294.84375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.2763598567963968, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0390625, + "kl": 0.0071170123119372874, + "learning_rate": 3.216e-06, + "loss": 0.0003, + "num_tokens": 55638435.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 90.3125, + "completions/mean_terminated_length": 90.3125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.2764753435731609, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.008130396126944106, + "learning_rate": 3.2139999999999996e-06, + "loss": 0.0003, + "num_tokens": 55666125.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 135.71875, + "completions/mean_terminated_length": 135.71875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.27659083034992493, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.0104526292088849, + "learning_rate": 3.212e-06, + "loss": 0.0004, + "num_tokens": 55692228.0, + "reward": 3.03926944732666, + "reward_std": 0.0447387769818306, + "rewards/reward_fn/mean": 3.03926944732666, + "rewards/reward_fn/std": 0.0447387620806694, + "step": 2395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 144.6875, + "completions/mean_terminated_length": 144.6875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.27670631712668897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.008604638584074564, + "learning_rate": 3.2099999999999998e-06, + "loss": 0.0003, + "num_tokens": 55712826.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 205.03125, + "completions/mean_terminated_length": 205.03125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.27682180390345307, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.010034568353148643, + "learning_rate": 3.208e-06, + "loss": 0.0004, + "num_tokens": 55732603.0, + "reward": 2.81656551361084, + "reward_std": 0.18743498623371124, + "rewards/reward_fn/mean": 2.81656551361084, + "rewards/reward_fn/std": 0.18743498623371124, + "step": 2397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 185.84375, + "completions/mean_terminated_length": 185.84375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.2769372906802171, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.020188888564007357, + "learning_rate": 3.206e-06, + "loss": 0.0008, + "num_tokens": 55758966.0, + "reward": 3.9679319858551025, + "reward_std": 0.18140390515327454, + "rewards/reward_fn/mean": 3.9679319858551025, + "rewards/reward_fn/std": 0.18140387535095215, + "step": 2398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 80.625, + "completions/mean_terminated_length": 80.625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.2770527774569812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.007230987799630384, + "learning_rate": 3.204e-06, + "loss": 0.0003, + "num_tokens": 55780490.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 111.15625, + "completions/mean_terminated_length": 111.15625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.27716826423374524, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.00476797454575717, + "learning_rate": 3.202e-06, + "loss": 0.0002, + "num_tokens": 55810671.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 239.25, + "completions/mean_terminated_length": 239.25, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.2772837510105093, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.012515654671005905, + "learning_rate": 3.2e-06, + "loss": 0.0005, + "num_tokens": 55841367.0, + "reward": 2.9629950523376465, + "reward_std": 0.0988195538520813, + "rewards/reward_fn/mean": 2.9629950523376465, + "rewards/reward_fn/std": 0.09881952404975891, + "step": 2401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 93.4375, + "completions/mean_terminated_length": 93.4375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.2773992377872734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.007221208023111103, + "learning_rate": 3.1979999999999997e-06, + "loss": 0.0003, + "num_tokens": 55867173.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 80.34375, + "completions/mean_terminated_length": 80.34375, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.2775147245640374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048095703125, + "kl": 0.0048737286360847065, + "learning_rate": 3.196e-06, + "loss": 0.0002, + "num_tokens": 55885520.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 178.9375, + "completions/mean_terminated_length": 178.9375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.27763021134080146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046875, + "kl": 0.006397063996701036, + "learning_rate": 3.194e-06, + "loss": 0.0003, + "num_tokens": 55908110.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 93.96875, + "completions/mean_terminated_length": 93.96875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.27774569811756555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.008513214663253166, + "learning_rate": 3.192e-06, + "loss": 0.0003, + "num_tokens": 55924813.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 109.03125, + "completions/mean_terminated_length": 109.03125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.2778611848943296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.671875, + "kl": 0.016002425443730317, + "learning_rate": 3.19e-06, + "loss": 0.0006, + "num_tokens": 55936462.0, + "reward": 3.623304843902588, + "reward_std": 0.0341368094086647, + "rewards/reward_fn/mean": 3.623304843902588, + "rewards/reward_fn/std": 0.03413679450750351, + "step": 2406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 212.5, + "completions/mean_terminated_length": 212.5, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.27797667167109363, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.017845089692855254, + "learning_rate": 3.1880000000000002e-06, + "loss": 0.0007, + "num_tokens": 55965598.0, + "reward": 3.8659865856170654, + "reward_std": 0.31672438979148865, + "rewards/reward_fn/mean": 3.8659865856170654, + "rewards/reward_fn/std": 0.31672441959381104, + "step": 2407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 182.25, + "completions/mean_terminated_length": 182.25, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.27809215844785773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.013488210897776298, + "learning_rate": 3.1859999999999997e-06, + "loss": 0.0005, + "num_tokens": 55981670.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 98.0625, + "completions/mean_terminated_length": 98.0625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.27820764522462177, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.014590456368750893, + "learning_rate": 3.184e-06, + "loss": 0.0006, + "num_tokens": 55992968.0, + "reward": 3.4723687171936035, + "reward_std": 0.026338709518313408, + "rewards/reward_fn/mean": 3.4723687171936035, + "rewards/reward_fn/std": 0.0263387281447649, + "step": 2409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 187.53125, + "completions/mean_terminated_length": 187.53125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.27832313200138586, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.009766281698830426, + "learning_rate": 3.1819999999999998e-06, + "loss": 0.0004, + "num_tokens": 56016217.0, + "reward": 3.7307214736938477, + "reward_std": 0.4381255507469177, + "rewards/reward_fn/mean": 3.7307214736938477, + "rewards/reward_fn/std": 0.4381255507469177, + "step": 2410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 222.8125, + "completions/mean_terminated_length": 222.8125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.2784386187781499, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.014470525304204784, + "learning_rate": 3.18e-06, + "loss": 0.0006, + "num_tokens": 56039251.0, + "reward": 3.480027437210083, + "reward_std": 0.5070092082023621, + "rewards/reward_fn/mean": 3.480027437210083, + "rewards/reward_fn/std": 0.5070092082023621, + "step": 2411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 200.21875, + "completions/mean_terminated_length": 200.21875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.27855410555491394, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.013290805072756484, + "learning_rate": 3.178e-06, + "loss": 0.0005, + "num_tokens": 56066842.0, + "reward": 3.6290619373321533, + "reward_std": 0.45987364649772644, + "rewards/reward_fn/mean": 3.6290619373321533, + "rewards/reward_fn/std": 0.45987361669540405, + "step": 2412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 171.59375, + "completions/mean_terminated_length": 171.59375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.27866959233167804, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.012645522147067823, + "learning_rate": 3.176e-06, + "loss": 0.0005, + "num_tokens": 56085037.0, + "reward": 3.4385623931884766, + "reward_std": 0.5035458207130432, + "rewards/reward_fn/mean": 3.4385623931884766, + "rewards/reward_fn/std": 0.5035458207130432, + "step": 2413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 223.8125, + "completions/mean_terminated_length": 223.8125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.2787850791084421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.008796795867965557, + "learning_rate": 3.1739999999999996e-06, + "loss": 0.0004, + "num_tokens": 56107303.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 109.5, + "completions/mean_terminated_length": 109.5, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.2789005658852061, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.046875, + "kl": 0.011662720011372585, + "learning_rate": 3.172e-06, + "loss": 0.0005, + "num_tokens": 56125335.0, + "reward": 3.9275670051574707, + "reward_std": 0.22900140285491943, + "rewards/reward_fn/mean": 3.9275670051574707, + "rewards/reward_fn/std": 0.22900140285491943, + "step": 2415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 194.03125, + "completions/mean_terminated_length": 194.03125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.2790160526619702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.01589282111672219, + "learning_rate": 3.1699999999999997e-06, + "loss": 0.0006, + "num_tokens": 56153816.0, + "reward": 3.479945659637451, + "reward_std": 0.48003530502319336, + "rewards/reward_fn/mean": 3.479945659637451, + "rewards/reward_fn/std": 0.48003530502319336, + "step": 2416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 182.65625, + "completions/mean_terminated_length": 182.65625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.27913153943873426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.010879661167564336, + "learning_rate": 3.168e-06, + "loss": 0.0004, + "num_tokens": 56182317.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 176.21875, + "completions/mean_terminated_length": 176.21875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.27924702621549835, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.02205071281059645, + "learning_rate": 3.166e-06, + "loss": 0.0009, + "num_tokens": 56212820.0, + "reward": 3.729573965072632, + "reward_std": 0.41100576519966125, + "rewards/reward_fn/mean": 3.729573965072632, + "rewards/reward_fn/std": 0.41100573539733887, + "step": 2418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 293.6875, + "completions/mean_terminated_length": 293.6875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.2793625129922624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.010177790696616285, + "learning_rate": 3.164e-06, + "loss": 0.0004, + "num_tokens": 56237194.0, + "reward": 3.9695792198181152, + "reward_std": 0.17208537459373474, + "rewards/reward_fn/mean": 3.9695792198181152, + "rewards/reward_fn/std": 0.17208537459373474, + "step": 2419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 376.96875, + "completions/mean_terminated_length": 376.96875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.27947799976902643, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.012578223351738416, + "learning_rate": 3.162e-06, + "loss": 0.0005, + "num_tokens": 56267049.0, + "reward": 3.9324538707733154, + "reward_std": 0.26689207553863525, + "rewards/reward_fn/mean": 3.9324538707733154, + "rewards/reward_fn/std": 0.26689210534095764, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 77.96875, + "completions/mean_terminated_length": 77.96875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.2795934865457905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.007093051048286725, + "learning_rate": 3.16e-06, + "loss": 0.0003, + "num_tokens": 56281000.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 111.21875, + "completions/mean_terminated_length": 111.21875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.27970897332255457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.010055286991700996, + "learning_rate": 3.1579999999999997e-06, + "loss": 0.0004, + "num_tokens": 56296335.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 104.65625, + "completions/mean_terminated_length": 104.65625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.2798244600993186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.162109375, + "kl": 0.01745575249151443, + "learning_rate": 3.156e-06, + "loss": 0.0007, + "num_tokens": 56314340.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 253.53125, + "completions/mean_terminated_length": 253.53125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.2799399468760827, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.027522149146534503, + "learning_rate": 3.1539999999999998e-06, + "loss": 0.0011, + "num_tokens": 56346581.0, + "reward": 3.687460422515869, + "reward_std": 0.6401947736740112, + "rewards/reward_fn/mean": 3.687460422515869, + "rewards/reward_fn/std": 0.6401947140693665, + "step": 2424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 231.625, + "completions/mean_terminated_length": 231.625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.28005543365284674, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.010107926376804244, + "learning_rate": 3.152e-06, + "loss": 0.0004, + "num_tokens": 56366729.0, + "reward": 3.855496644973755, + "reward_std": 0.5686783194541931, + "rewards/reward_fn/mean": 3.855496644973755, + "rewards/reward_fn/std": 0.5686782598495483, + "step": 2425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 116.0, + "completions/mean_terminated_length": 116.0, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.28017092042961084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.010256397115881555, + "learning_rate": 3.15e-06, + "loss": 0.0004, + "num_tokens": 56387721.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 387.375, + "completions/mean_terminated_length": 387.375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.2802864072063749, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.011088983548688702, + "learning_rate": 3.148e-06, + "loss": 0.0004, + "num_tokens": 56424149.0, + "reward": 3.714299201965332, + "reward_std": 0.7680469751358032, + "rewards/reward_fn/mean": 3.714299201965332, + "rewards/reward_fn/std": 0.768047034740448, + "step": 2427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 179.59375, + "completions/mean_terminated_length": 179.59375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.2804018939831389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05029296875, + "kl": 0.00922622176585719, + "learning_rate": 3.1459999999999996e-06, + "loss": 0.0004, + "num_tokens": 56442216.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 193.09375, + "completions/mean_terminated_length": 193.09375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.280517380759903, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.00979059208475519, + "learning_rate": 3.144e-06, + "loss": 0.0004, + "num_tokens": 56470859.0, + "reward": 3.985771417617798, + "reward_std": 0.08048927783966064, + "rewards/reward_fn/mean": 3.985771417617798, + "rewards/reward_fn/std": 0.08048927038908005, + "step": 2429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 175.125, + "completions/mean_terminated_length": 175.125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.28063286753666705, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.96875, + "kl": 0.020691643279860727, + "learning_rate": 3.1419999999999997e-06, + "loss": 0.0008, + "num_tokens": 56499375.0, + "reward": 3.8637914657592773, + "reward_std": 0.4314402639865875, + "rewards/reward_fn/mean": 3.8637914657592773, + "rewards/reward_fn/std": 0.4314403235912323, + "step": 2430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 315.46875, + "completions/mean_terminated_length": 315.46875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.2807483543134311, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.010695676304749213, + "learning_rate": 3.14e-06, + "loss": 0.0004, + "num_tokens": 56528702.0, + "reward": 3.7522101402282715, + "reward_std": 0.4107814133167267, + "rewards/reward_fn/mean": 3.7522101402282715, + "rewards/reward_fn/std": 0.4107814133167267, + "step": 2431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 257.8125, + "completions/mean_terminated_length": 257.8125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.2808638410901952, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.016009466227842495, + "learning_rate": 3.138e-06, + "loss": 0.0006, + "num_tokens": 56559224.0, + "reward": 3.402031898498535, + "reward_std": 0.5528542399406433, + "rewards/reward_fn/mean": 3.402031898498535, + "rewards/reward_fn/std": 0.5528542995452881, + "step": 2432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 247.5, + "completions/mean_terminated_length": 247.5, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.2809793278669592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.008571544720325619, + "learning_rate": 3.136e-06, + "loss": 0.0003, + "num_tokens": 56577704.0, + "reward": 3.930878162384033, + "reward_std": 0.39101284742355347, + "rewards/reward_fn/mean": 3.930878162384033, + "rewards/reward_fn/std": 0.3910128176212311, + "step": 2433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 239.90625, + "completions/mean_terminated_length": 239.90625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.28109481464372327, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.012151090617408045, + "learning_rate": 3.1339999999999996e-06, + "loss": 0.0005, + "num_tokens": 56598149.0, + "reward": 3.9295740127563477, + "reward_std": 0.39838987588882446, + "rewards/reward_fn/mean": 3.9295740127563477, + "rewards/reward_fn/std": 0.39838990569114685, + "step": 2434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 219.625, + "completions/mean_terminated_length": 219.625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.28121030142048736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.019473565655061975, + "learning_rate": 3.132e-06, + "loss": 0.0008, + "num_tokens": 56616473.0, + "reward": 3.3856372833251953, + "reward_std": 0.5301838517189026, + "rewards/reward_fn/mean": 3.3856372833251953, + "rewards/reward_fn/std": 0.5301837921142578, + "step": 2435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 256.4375, + "completions/mean_terminated_length": 256.4375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.2813257881972514, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.011595070202019997, + "learning_rate": 3.1299999999999997e-06, + "loss": 0.0005, + "num_tokens": 56647815.0, + "reward": 3.5181009769439697, + "reward_std": 0.9385389089584351, + "rewards/reward_fn/mean": 3.5181009769439697, + "rewards/reward_fn/std": 0.9385389089584351, + "step": 2436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 175.03125, + "completions/mean_terminated_length": 175.03125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.2814412749740155, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.007791429714416154, + "learning_rate": 3.128e-06, + "loss": 0.0003, + "num_tokens": 56676680.0, + "reward": 2.7690412998199463, + "reward_std": 0.047286469489336014, + "rewards/reward_fn/mean": 2.7690412998199463, + "rewards/reward_fn/std": 0.04728645458817482, + "step": 2437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 100.90625, + "completions/mean_terminated_length": 100.90625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.28155676175077954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052001953125, + "kl": 0.005966350909147877, + "learning_rate": 3.126e-06, + "loss": 0.0002, + "num_tokens": 56692037.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 357.9375, + "completions/mean_terminated_length": 357.9375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.2816722485275436, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.008153597766067833, + "learning_rate": 3.124e-06, + "loss": 0.0003, + "num_tokens": 56718723.0, + "reward": 3.9252512454986572, + "reward_std": 0.42284268140792847, + "rewards/reward_fn/mean": 3.9252512454986572, + "rewards/reward_fn/std": 0.42284268140792847, + "step": 2439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 169.71875, + "completions/mean_terminated_length": 169.71875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.2817877353043077, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.007798905982781434, + "learning_rate": 3.122e-06, + "loss": 0.0003, + "num_tokens": 56746618.0, + "reward": 2.7128443717956543, + "reward_std": 0.037108588963747025, + "rewards/reward_fn/mean": 2.7128443717956543, + "rewards/reward_fn/std": 0.037108611315488815, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 202.03125, + "completions/mean_terminated_length": 202.03125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.2819032220810717, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.012866146076703444, + "learning_rate": 3.1199999999999998e-06, + "loss": 0.0005, + "num_tokens": 56774907.0, + "reward": 3.810595989227295, + "reward_std": 0.4501020610332489, + "rewards/reward_fn/mean": 3.810595989227295, + "rewards/reward_fn/std": 0.4501020014286041, + "step": 2441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 138.40625, + "completions/mean_terminated_length": 138.40625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.28201870885783575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.013867341272998601, + "learning_rate": 3.1179999999999996e-06, + "loss": 0.0006, + "num_tokens": 56801096.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 71.59375, + "completions/mean_terminated_length": 71.59375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.28213419563459985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1845703125, + "kl": 0.011935102884308435, + "learning_rate": 3.116e-06, + "loss": 0.0005, + "num_tokens": 56829371.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 291.28125, + "completions/mean_terminated_length": 291.28125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.2822496824113639, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.010959404753521085, + "learning_rate": 3.1139999999999997e-06, + "loss": 0.0004, + "num_tokens": 56850948.0, + "reward": 3.8406052589416504, + "reward_std": 0.4833778738975525, + "rewards/reward_fn/mean": 3.8406052589416504, + "rewards/reward_fn/std": 0.4833778738975525, + "step": 2444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 147.53125, + "completions/mean_terminated_length": 147.53125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.282365169188128, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.0103683744018781, + "learning_rate": 3.112e-06, + "loss": 0.0004, + "num_tokens": 56874197.0, + "reward": 3.2061331272125244, + "reward_std": 0.05452845245599747, + "rewards/reward_fn/mean": 3.2061331272125244, + "rewards/reward_fn/std": 0.054528433829545975, + "step": 2445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 142.53125, + "completions/mean_terminated_length": 142.53125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.282480655964892, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.018781607475830242, + "learning_rate": 3.11e-06, + "loss": 0.0008, + "num_tokens": 56896838.0, + "reward": 3.9377565383911133, + "reward_std": 0.2451355904340744, + "rewards/reward_fn/mean": 3.9377565383911133, + "rewards/reward_fn/std": 0.24513556063175201, + "step": 2446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 145.15625, + "completions/mean_terminated_length": 145.15625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.28259614274165606, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.009568286608555354, + "learning_rate": 3.108e-06, + "loss": 0.0004, + "num_tokens": 56918379.0, + "reward": 3.130316734313965, + "reward_std": 0.093386709690094, + "rewards/reward_fn/mean": 3.130316734313965, + "rewards/reward_fn/std": 0.09338672459125519, + "step": 2447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 347.78125, + "completions/mean_terminated_length": 347.78125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.28271162951842016, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.008196005823265295, + "learning_rate": 3.1059999999999996e-06, + "loss": 0.0003, + "num_tokens": 56945508.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 139.375, + "completions/mean_terminated_length": 139.375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.2828271162951842, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.01041700794303324, + "learning_rate": 3.104e-06, + "loss": 0.0004, + "num_tokens": 56967568.0, + "reward": 3.922456741333008, + "reward_std": 0.24575269222259521, + "rewards/reward_fn/mean": 3.922456741333008, + "rewards/reward_fn/std": 0.2457527369260788, + "step": 2449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 190.34375, + "completions/mean_terminated_length": 190.34375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.28294260307194824, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.024303817859617993, + "learning_rate": 3.1019999999999997e-06, + "loss": 0.001, + "num_tokens": 56986427.0, + "reward": 3.8497767448425293, + "reward_std": 0.3583700358867645, + "rewards/reward_fn/mean": 3.8497767448425293, + "rewards/reward_fn/std": 0.3583700954914093, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 147.375, + "completions/mean_terminated_length": 147.375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.28305808984871234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.007977922778081847, + "learning_rate": 3.1e-06, + "loss": 0.0003, + "num_tokens": 57015303.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 123.5625, + "completions/mean_terminated_length": 123.5625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.2831735766254764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.009748876676894724, + "learning_rate": 3.098e-06, + "loss": 0.0004, + "num_tokens": 57042361.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 92.6875, + "completions/mean_terminated_length": 92.6875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.28328906340224047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.007181107590440661, + "learning_rate": 3.096e-06, + "loss": 0.0003, + "num_tokens": 57061423.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 161.03125, + "completions/mean_terminated_length": 161.03125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.2834045501790045, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.020891766689601354, + "learning_rate": 3.0939999999999995e-06, + "loss": 0.0008, + "num_tokens": 57084720.0, + "reward": 3.938718557357788, + "reward_std": 0.20317722856998444, + "rewards/reward_fn/mean": 3.938718557357788, + "rewards/reward_fn/std": 0.20317719876766205, + "step": 2454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 362.0, + "completions/mean_terminated_length": 362.0, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.28352003695576855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.009082495686016046, + "learning_rate": 3.0919999999999998e-06, + "loss": 0.0004, + "num_tokens": 57112464.0, + "reward": 3.9317710399627686, + "reward_std": 0.3859613239765167, + "rewards/reward_fn/mean": 3.9317710399627686, + "rewards/reward_fn/std": 0.3859613239765167, + "step": 2455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 121.59375, + "completions/mean_terminated_length": 121.59375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.28363552373253265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.009825878601986915, + "learning_rate": 3.0899999999999996e-06, + "loss": 0.0004, + "num_tokens": 57135523.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 67.78125, + "completions/mean_terminated_length": 67.78125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.2837510105092967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203125, + "kl": 0.007501418291212758, + "learning_rate": 3.088e-06, + "loss": 0.0003, + "num_tokens": 57151260.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 227.25, + "completions/mean_terminated_length": 227.25, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.2838664972860607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.009291626542108133, + "learning_rate": 3.0859999999999998e-06, + "loss": 0.0004, + "num_tokens": 57173892.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1021.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 493.28125, + "completions/mean_terminated_length": 493.28125, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.2839819840628248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7734375, + "kl": 0.009343770172563381, + "learning_rate": 3.084e-06, + "loss": 0.0004, + "num_tokens": 57201037.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 2459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 166.625, + "completions/mean_terminated_length": 166.625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.28409747083958886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.011926915511139669, + "learning_rate": 3.082e-06, + "loss": 0.0005, + "num_tokens": 57214465.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 142.0, + "completions/mean_terminated_length": 142.0, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.2842129576163529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.006910645810421556, + "learning_rate": 3.0799999999999997e-06, + "loss": 0.0003, + "num_tokens": 57242465.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 119.375, + "completions/mean_terminated_length": 119.375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.284328444393117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1767578125, + "kl": 0.01334578142996179, + "learning_rate": 3.0779999999999996e-06, + "loss": 0.0005, + "num_tokens": 57272205.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 118.65625, + "completions/mean_terminated_length": 118.65625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.28444393116988104, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.010434578282001894, + "learning_rate": 3.076e-06, + "loss": 0.0004, + "num_tokens": 57301474.0, + "reward": 3.930372714996338, + "reward_std": 0.39387190341949463, + "rewards/reward_fn/mean": 3.930372714996338, + "rewards/reward_fn/std": 0.393871933221817, + "step": 2463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 117.875, + "completions/mean_terminated_length": 117.875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.28455941794664513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.01029497962736059, + "learning_rate": 3.0739999999999997e-06, + "loss": 0.0004, + "num_tokens": 57331326.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 67.28125, + "completions/mean_terminated_length": 67.28125, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.2846749047234092, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.01039418246364221, + "learning_rate": 3.072e-06, + "loss": 0.0004, + "num_tokens": 57358535.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 261.71875, + "completions/mean_terminated_length": 261.71875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.2847903915001732, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.009583460399881005, + "learning_rate": 3.07e-06, + "loss": 0.0004, + "num_tokens": 57385246.0, + "reward": 3.9058783054351807, + "reward_std": 0.3017742335796356, + "rewards/reward_fn/mean": 3.9058783054351807, + "rewards/reward_fn/std": 0.3017742335796356, + "step": 2466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 161.46875, + "completions/mean_terminated_length": 161.46875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.2849058782769373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.010744250459538307, + "learning_rate": 3.068e-06, + "loss": 0.0004, + "num_tokens": 57406285.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 92.9375, + "completions/mean_terminated_length": 92.9375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.28502136505370135, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.006138697561254958, + "learning_rate": 3.0659999999999995e-06, + "loss": 0.0002, + "num_tokens": 57432043.0, + "reward": 2.821702003479004, + "reward_std": 0.05878188833594322, + "rewards/reward_fn/mean": 2.821702003479004, + "rewards/reward_fn/std": 0.05878187716007233, + "step": 2468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 265.0625, + "completions/mean_terminated_length": 265.0625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.2851368518304654, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.012025904274196364, + "learning_rate": 3.064e-06, + "loss": 0.0005, + "num_tokens": 57452429.0, + "reward": 3.931211233139038, + "reward_std": 0.3891283869743347, + "rewards/reward_fn/mean": 3.931211233139038, + "rewards/reward_fn/std": 0.38912832736968994, + "step": 2469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 144.5625, + "completions/mean_terminated_length": 144.5625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.2852523386072295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.010818376511451788, + "learning_rate": 3.0619999999999997e-06, + "loss": 0.0004, + "num_tokens": 57475999.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 83.4375, + "completions/mean_terminated_length": 83.4375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.2853678253839935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.010268387959513348, + "learning_rate": 3.06e-06, + "loss": 0.0004, + "num_tokens": 57490637.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 193.78125, + "completions/mean_terminated_length": 193.78125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.2854833121607576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04833984375, + "kl": 0.009986670091166161, + "learning_rate": 3.0579999999999998e-06, + "loss": 0.0004, + "num_tokens": 57509158.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 202.71875, + "completions/mean_terminated_length": 202.71875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.28559879893752166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419921875, + "kl": 0.010212605659035034, + "learning_rate": 3.056e-06, + "loss": 0.0004, + "num_tokens": 57527997.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 69.9375, + "completions/mean_terminated_length": 69.9375, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.2857142857142857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.154296875, + "kl": 0.01413862498884555, + "learning_rate": 3.054e-06, + "loss": 0.0006, + "num_tokens": 57556379.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 423.09375, + "completions/mean_terminated_length": 423.09375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.2858297724910498, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.011880174366524443, + "learning_rate": 3.0519999999999997e-06, + "loss": 0.0005, + "num_tokens": 57586078.0, + "reward": 3.859053134918213, + "reward_std": 0.5547800660133362, + "rewards/reward_fn/mean": 3.859053134918213, + "rewards/reward_fn/std": 0.5547800660133362, + "step": 2475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 140.78125, + "completions/mean_terminated_length": 140.78125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.28594525926781383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.01140660117380321, + "learning_rate": 3.0499999999999996e-06, + "loss": 0.0005, + "num_tokens": 57603415.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 180.65625, + "completions/mean_terminated_length": 180.65625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.2860607460445779, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "kl": 0.015990760977729224, + "learning_rate": 3.048e-06, + "loss": 0.0006, + "num_tokens": 57633004.0, + "reward": 3.948516607284546, + "reward_std": 0.20275697112083435, + "rewards/reward_fn/mean": 3.948516607284546, + "rewards/reward_fn/std": 0.20275695621967316, + "step": 2477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 93.78125, + "completions/mean_terminated_length": 93.78125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.28617623282134197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.012750337009492796, + "learning_rate": 3.0459999999999997e-06, + "loss": 0.0005, + "num_tokens": 57654405.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 140.59375, + "completions/mean_terminated_length": 140.59375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.286291719598106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.013194800543715246, + "learning_rate": 3.044e-06, + "loss": 0.0005, + "num_tokens": 57676760.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1013.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 477.9375, + "completions/mean_terminated_length": 477.9375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.2864072063748701, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0308837890625, + "kl": 0.008207883452996612, + "learning_rate": 3.042e-06, + "loss": 0.0003, + "num_tokens": 57701878.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 436.90625, + "completions/mean_terminated_length": 436.90625, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.28652269315163414, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.008786721373326145, + "learning_rate": 3.0399999999999997e-06, + "loss": 0.0004, + "num_tokens": 57729203.0, + "reward": 3.8514251708984375, + "reward_std": 0.5010020136833191, + "rewards/reward_fn/mean": 3.8514251708984375, + "rewards/reward_fn/std": 0.5010020136833191, + "step": 2481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 241.5, + "completions/mean_terminated_length": 241.5, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.2866381799283982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04736328125, + "kl": 0.008205694211937953, + "learning_rate": 3.0379999999999995e-06, + "loss": 0.0003, + "num_tokens": 57752323.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 90.90625, + "completions/mean_terminated_length": 90.90625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.2867536667051623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037353515625, + "kl": 0.0030665200802104664, + "learning_rate": 3.036e-06, + "loss": 0.0001, + "num_tokens": 57770240.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 171.625, + "completions/mean_terminated_length": 171.625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.2868691534819263, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.014228525004000403, + "learning_rate": 3.0339999999999997e-06, + "loss": 0.0006, + "num_tokens": 57792276.0, + "reward": 3.8720741271972656, + "reward_std": 0.4451840817928314, + "rewards/reward_fn/mean": 3.8720741271972656, + "rewards/reward_fn/std": 0.44518402218818665, + "step": 2484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 230.5625, + "completions/mean_terminated_length": 230.5625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.28698464025869036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.017107362669776194, + "learning_rate": 3.032e-06, + "loss": 0.0007, + "num_tokens": 57811814.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 82.40625, + "completions/mean_terminated_length": 82.40625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.28710012703545446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.008243526775913779, + "learning_rate": 3.03e-06, + "loss": 0.0003, + "num_tokens": 57825523.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 103.71875, + "completions/mean_terminated_length": 103.71875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.2872156138122185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.007372692467470188, + "learning_rate": 3.028e-06, + "loss": 0.0003, + "num_tokens": 57841578.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 241.84375, + "completions/mean_terminated_length": 241.84375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.28733110058898254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.013398765368037857, + "learning_rate": 3.0259999999999995e-06, + "loss": 0.0005, + "num_tokens": 57865413.0, + "reward": 3.763939380645752, + "reward_std": 0.3466610014438629, + "rewards/reward_fn/mean": 3.763939380645752, + "rewards/reward_fn/std": 0.34666091203689575, + "step": 2488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 150.75, + "completions/mean_terminated_length": 150.75, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.28744658736574663, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.008318355488881934, + "learning_rate": 3.0239999999999998e-06, + "loss": 0.0003, + "num_tokens": 57888413.0, + "reward": 3.8608241081237793, + "reward_std": 0.547651469707489, + "rewards/reward_fn/mean": 3.8608241081237793, + "rewards/reward_fn/std": 0.547651469707489, + "step": 2489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 118.625, + "completions/mean_terminated_length": 118.625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.28756207414251067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.009461664405534975, + "learning_rate": 3.0219999999999996e-06, + "loss": 0.0004, + "num_tokens": 57903729.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 76.15625, + "completions/mean_terminated_length": 76.15625, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.28767756091927477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.007841003278372227, + "learning_rate": 3.02e-06, + "loss": 0.0003, + "num_tokens": 57924918.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 92.5625, + "completions/mean_terminated_length": 92.5625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.2877930476960388, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.005851506840372167, + "learning_rate": 3.0179999999999997e-06, + "loss": 0.0002, + "num_tokens": 57939688.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 134.625, + "completions/mean_terminated_length": 134.625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.28790853447280285, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.008415460106334649, + "learning_rate": 3.016e-06, + "loss": 0.0003, + "num_tokens": 57953788.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 190.5625, + "completions/mean_terminated_length": 190.5625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.28802402124956694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.010024899289419409, + "learning_rate": 3.014e-06, + "loss": 0.0004, + "num_tokens": 57971534.0, + "reward": 3.9286231994628906, + "reward_std": 0.40376749634742737, + "rewards/reward_fn/mean": 3.9286231994628906, + "rewards/reward_fn/std": 0.40376749634742737, + "step": 2494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 410.9375, + "completions/mean_terminated_length": 410.9375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.288139508026331, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.009272248760680668, + "learning_rate": 3.0119999999999997e-06, + "loss": 0.0004, + "num_tokens": 57994316.0, + "reward": 3.9300270080566406, + "reward_std": 0.3958275616168976, + "rewards/reward_fn/mean": 3.9300270080566406, + "rewards/reward_fn/std": 0.3958275020122528, + "step": 2495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 100.0, + "completions/mean_terminated_length": 100.0, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.288254994803095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.011315487208776176, + "learning_rate": 3.0099999999999996e-06, + "loss": 0.0005, + "num_tokens": 58015276.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 177.71875, + "completions/mean_terminated_length": 177.71875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.2883704815798591, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.006940738981938921, + "learning_rate": 3.008e-06, + "loss": 0.0003, + "num_tokens": 58033059.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 145.96875, + "completions/mean_terminated_length": 145.96875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.28848596835662316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.013784174545435235, + "learning_rate": 3.0059999999999997e-06, + "loss": 0.0006, + "num_tokens": 58058210.0, + "reward": 3.9869213104248047, + "reward_std": 0.07398424297571182, + "rewards/reward_fn/mean": 3.9869213104248047, + "rewards/reward_fn/std": 0.07398423552513123, + "step": 2498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 105.0, + "completions/mean_terminated_length": 105.0, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.28860145513338725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.013428337348159403, + "learning_rate": 3.004e-06, + "loss": 0.0005, + "num_tokens": 58080354.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 401.75, + "completions/mean_terminated_length": 348.6451416015625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.2887169419101513, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.010777211642562179, + "learning_rate": 3.002e-06, + "loss": 0.0004, + "num_tokens": 58105946.0, + "reward": 3.3979644775390625, + "reward_std": 1.1169192790985107, + "rewards/reward_fn/mean": 3.3979644775390625, + "rewards/reward_fn/std": 1.1169192790985107, + "step": 2500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 138.46875, + "completions/mean_terminated_length": 138.46875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.28883242868691533, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.0222467269632034, + "learning_rate": 3e-06, + "loss": 0.0009, + "num_tokens": 58132009.0, + "reward": 3.948936939239502, + "reward_std": 0.2009439319372177, + "rewards/reward_fn/mean": 3.948936939239502, + "rewards/reward_fn/std": 0.20094391703605652, + "step": 2501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 192.78125, + "completions/mean_terminated_length": 192.78125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.28894791546367943, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.012197042160551064, + "learning_rate": 2.998e-06, + "loss": 0.0005, + "num_tokens": 58151906.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 192.375, + "completions/mean_terminated_length": 192.375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.28906340224044347, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05908203125, + "kl": 0.009540440012642648, + "learning_rate": 2.9959999999999998e-06, + "loss": 0.0004, + "num_tokens": 58170094.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 200.40625, + "completions/mean_terminated_length": 200.40625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.2891788890172075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.008752748210099526, + "learning_rate": 2.994e-06, + "loss": 0.0004, + "num_tokens": 58190427.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 364.34375, + "completions/mean_terminated_length": 364.34375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.2892943757939716, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.010502119272132404, + "learning_rate": 2.992e-06, + "loss": 0.0004, + "num_tokens": 58221766.0, + "reward": 3.6019821166992188, + "reward_std": 0.5260512232780457, + "rewards/reward_fn/mean": 3.6019821166992188, + "rewards/reward_fn/std": 0.5260512232780457, + "step": 2505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 228.5, + "completions/mean_terminated_length": 228.5, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.28940986257073564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.011873100243974477, + "learning_rate": 2.99e-06, + "loss": 0.0005, + "num_tokens": 58242870.0, + "reward": 3.92767596244812, + "reward_std": 0.40912649035453796, + "rewards/reward_fn/mean": 3.92767596244812, + "rewards/reward_fn/std": 0.4091264009475708, + "step": 2506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 216.40625, + "completions/mean_terminated_length": 216.40625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.28952534934749974, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.009769160213181749, + "learning_rate": 2.988e-06, + "loss": 0.0004, + "num_tokens": 58276899.0, + "reward": 3.1834301948547363, + "reward_std": 0.09516559541225433, + "rewards/reward_fn/mean": 3.1834301948547363, + "rewards/reward_fn/std": 0.09516557306051254, + "step": 2507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 283.8125, + "completions/mean_terminated_length": 283.8125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.2896408361242638, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.01045547415560577, + "learning_rate": 2.986e-06, + "loss": 0.0004, + "num_tokens": 58301053.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 84.0, + "completions/mean_length": 62.21875, + "completions/mean_terminated_length": 62.21875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.2897563229010278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0458984375, + "kl": 0.003687883085149224, + "learning_rate": 2.9839999999999997e-06, + "loss": 0.0001, + "num_tokens": 58321988.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 142.84375, + "completions/mean_terminated_length": 142.84375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.2898718096777919, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.036200532340444624, + "learning_rate": 2.982e-06, + "loss": 0.0014, + "num_tokens": 58348607.0, + "reward": 3.7020206451416016, + "reward_std": 0.43131545186042786, + "rewards/reward_fn/mean": 3.7020206451416016, + "rewards/reward_fn/std": 0.43131545186042786, + "step": 2510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 303.9375, + "completions/mean_terminated_length": 303.9375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.28998729645455595, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.0107446369365789, + "learning_rate": 2.98e-06, + "loss": 0.0004, + "num_tokens": 58371037.0, + "reward": 3.7884817123413086, + "reward_std": 0.448696106672287, + "rewards/reward_fn/mean": 3.7884817123413086, + "rewards/reward_fn/std": 0.4486960768699646, + "step": 2511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 81.25, + "completions/mean_terminated_length": 81.25, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.29010278323132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.010162534956180025, + "learning_rate": 2.978e-06, + "loss": 0.0004, + "num_tokens": 58387173.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 122.0, + "completions/mean_terminated_length": 122.0, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.2902182700080841, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.023748793071717955, + "learning_rate": 2.976e-06, + "loss": 0.0009, + "num_tokens": 58411781.0, + "reward": 3.946728229522705, + "reward_std": 0.16846361756324768, + "rewards/reward_fn/mean": 3.946728229522705, + "rewards/reward_fn/std": 0.16846361756324768, + "step": 2513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 121.125, + "completions/mean_terminated_length": 121.125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.29033375678484813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.014819601783528924, + "learning_rate": 2.9740000000000002e-06, + "loss": 0.0006, + "num_tokens": 58437641.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 215.03125, + "completions/mean_terminated_length": 215.03125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.29044924356161217, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.009053867157490458, + "learning_rate": 2.9719999999999997e-06, + "loss": 0.0004, + "num_tokens": 58462890.0, + "reward": 3.0780692100524902, + "reward_std": 0.35558614134788513, + "rewards/reward_fn/mean": 3.0780692100524902, + "rewards/reward_fn/std": 0.3555861711502075, + "step": 2515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 176.5625, + "completions/mean_terminated_length": 176.5625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.29056473033837626, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.027597895212238654, + "learning_rate": 2.97e-06, + "loss": 0.0011, + "num_tokens": 58490684.0, + "reward": 3.942986249923706, + "reward_std": 0.2267826646566391, + "rewards/reward_fn/mean": 3.942986249923706, + "rewards/reward_fn/std": 0.2267826646566391, + "step": 2516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 261.3125, + "completions/mean_terminated_length": 261.3125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.2906802171151403, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.013820055333781056, + "learning_rate": 2.968e-06, + "loss": 0.0006, + "num_tokens": 58510246.0, + "reward": 3.9055821895599365, + "reward_std": 0.3087639808654785, + "rewards/reward_fn/mean": 3.9055821895599365, + "rewards/reward_fn/std": 0.30876395106315613, + "step": 2517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 219.15625, + "completions/mean_terminated_length": 219.15625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.2907957038919044, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.019431764114415273, + "learning_rate": 2.966e-06, + "loss": 0.0008, + "num_tokens": 58537931.0, + "reward": 3.941634178161621, + "reward_std": 0.22967679798603058, + "rewards/reward_fn/mean": 3.941634178161621, + "rewards/reward_fn/std": 0.22967679798603058, + "step": 2518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 70.5, + "completions/mean_terminated_length": 70.5, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.29091119066866844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.004112694339710288, + "learning_rate": 2.964e-06, + "loss": 0.0002, + "num_tokens": 58551579.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 294.53125, + "completions/mean_terminated_length": 294.53125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.2910266774454325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036865234375, + "kl": 0.006612831501115579, + "learning_rate": 2.962e-06, + "loss": 0.0003, + "num_tokens": 58576044.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 194.46875, + "completions/mean_terminated_length": 194.46875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.2911421642221966, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.011925474740564823, + "learning_rate": 2.96e-06, + "loss": 0.0005, + "num_tokens": 58601371.0, + "reward": 3.5687003135681152, + "reward_std": 0.6223995685577393, + "rewards/reward_fn/mean": 3.5687003135681152, + "rewards/reward_fn/std": 0.6223995089530945, + "step": 2521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 137.71875, + "completions/mean_terminated_length": 137.71875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.2912576509989606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050048828125, + "kl": 0.007962800002133008, + "learning_rate": 2.958e-06, + "loss": 0.0003, + "num_tokens": 58618962.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 146.1875, + "completions/mean_terminated_length": 146.1875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.29137313777572466, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.009212083146849182, + "learning_rate": 2.9559999999999997e-06, + "loss": 0.0004, + "num_tokens": 58645848.0, + "reward": 3.9412474632263184, + "reward_std": 0.2318696528673172, + "rewards/reward_fn/mean": 3.9412474632263184, + "rewards/reward_fn/std": 0.2318696677684784, + "step": 2523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 307.09375, + "completions/mean_terminated_length": 307.09375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.29148862455248875, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.011747406591894105, + "learning_rate": 2.954e-06, + "loss": 0.0005, + "num_tokens": 58682139.0, + "reward": 3.7724480628967285, + "reward_std": 0.43823057413101196, + "rewards/reward_fn/mean": 3.7724480628967285, + "rewards/reward_fn/std": 0.43823060393333435, + "step": 2524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 100.5625, + "completions/mean_terminated_length": 100.5625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.2916041113292528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.012115366691432428, + "learning_rate": 2.952e-06, + "loss": 0.0005, + "num_tokens": 58701837.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 66.21875, + "completions/mean_terminated_length": 66.21875, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.2917195981060169, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.0075450487493071705, + "learning_rate": 2.95e-06, + "loss": 0.0003, + "num_tokens": 58728148.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 225.875, + "completions/mean_terminated_length": 225.875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.2918350848827809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.008032280624320265, + "learning_rate": 2.948e-06, + "loss": 0.0003, + "num_tokens": 58749520.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 154.8125, + "completions/mean_terminated_length": 154.8125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.29195057165954497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.009048082676599734, + "learning_rate": 2.946e-06, + "loss": 0.0004, + "num_tokens": 58775146.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 82.3125, + "completions/mean_terminated_length": 82.3125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.29206605843630906, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.007457668300048681, + "learning_rate": 2.9439999999999997e-06, + "loss": 0.0003, + "num_tokens": 58800116.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 110.59375, + "completions/mean_terminated_length": 110.59375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.2921815452130731, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.011647103819996119, + "learning_rate": 2.942e-06, + "loss": 0.0005, + "num_tokens": 58822663.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 229.59375, + "completions/mean_terminated_length": 229.59375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.29229703198983714, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.012666328220802825, + "learning_rate": 2.94e-06, + "loss": 0.0005, + "num_tokens": 58846970.0, + "reward": 3.972994565963745, + "reward_std": 0.15276546776294708, + "rewards/reward_fn/mean": 3.972994565963745, + "rewards/reward_fn/std": 0.15276546776294708, + "step": 2531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 128.40625, + "completions/mean_terminated_length": 128.40625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.29241251876660124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.017579268846020568, + "learning_rate": 2.938e-06, + "loss": 0.0007, + "num_tokens": 58867431.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 176.65625, + "completions/mean_terminated_length": 176.65625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.2925280055433653, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.007859211269533262, + "learning_rate": 2.936e-06, + "loss": 0.0003, + "num_tokens": 58894588.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 298.28125, + "completions/mean_terminated_length": 298.28125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.2926434923201294, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.016175679746083915, + "learning_rate": 2.934e-06, + "loss": 0.0006, + "num_tokens": 58924165.0, + "reward": 3.784782886505127, + "reward_std": 0.6272717714309692, + "rewards/reward_fn/mean": 3.784782886505127, + "rewards/reward_fn/std": 0.6272717118263245, + "step": 2534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 97.6875, + "completions/mean_terminated_length": 97.6875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.2927589790968934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.011405711629777215, + "learning_rate": 2.9319999999999996e-06, + "loss": 0.0005, + "num_tokens": 58939643.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 400.96875, + "completions/mean_terminated_length": 347.83868408203125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.29287446587365745, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.011558935206267051, + "learning_rate": 2.93e-06, + "loss": 0.0005, + "num_tokens": 58964858.0, + "reward": 3.75, + "reward_std": 0.9837387204170227, + "rewards/reward_fn/mean": 3.75, + "rewards/reward_fn/std": 0.9837387204170227, + "step": 2536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 71.4375, + "completions/mean_terminated_length": 71.4375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.29298995265042155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048583984375, + "kl": 0.004267784735930036, + "learning_rate": 2.9279999999999997e-06, + "loss": 0.0002, + "num_tokens": 58989032.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 124.0, + "completions/mean_terminated_length": 124.0, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.2931054394271856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.011279146099695936, + "learning_rate": 2.926e-06, + "loss": 0.0005, + "num_tokens": 59011880.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.29322092620394963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.012478736622142605, + "learning_rate": 2.924e-06, + "loss": 0.0005, + "num_tokens": 59043251.0, + "reward": 3.252739429473877, + "reward_std": 0.695571780204773, + "rewards/reward_fn/mean": 3.252739429473877, + "rewards/reward_fn/std": 0.695571780204773, + "step": 2539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 184.5, + "completions/mean_terminated_length": 184.5, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.2933364129807137, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.012601358743268065, + "learning_rate": 2.922e-06, + "loss": 0.0005, + "num_tokens": 59065795.0, + "reward": 3.3922574520111084, + "reward_std": 0.36325016617774963, + "rewards/reward_fn/mean": 3.3922574520111084, + "rewards/reward_fn/std": 0.36325013637542725, + "step": 2540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 141.625, + "completions/mean_terminated_length": 141.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.29345189975747776, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.005498331087437691, + "learning_rate": 2.92e-06, + "loss": 0.0002, + "num_tokens": 59095031.0, + "reward": 3.9633543491363525, + "reward_std": 0.20729875564575195, + "rewards/reward_fn/mean": 3.9633543491363525, + "rewards/reward_fn/std": 0.20729871094226837, + "step": 2541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 96.46875, + "completions/mean_terminated_length": 96.46875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.2935673865342418, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.40625, + "kl": 0.009076198533875868, + "learning_rate": 2.918e-06, + "loss": 0.0004, + "num_tokens": 59118790.0, + "reward": 3.9415204524993896, + "reward_std": 0.23118925094604492, + "rewards/reward_fn/mean": 3.9415204524993896, + "rewards/reward_fn/std": 0.23118923604488373, + "step": 2542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 90.9375, + "completions/mean_terminated_length": 90.9375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.2936828733110059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.00739404046908021, + "learning_rate": 2.9159999999999997e-06, + "loss": 0.0003, + "num_tokens": 59147556.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 88.625, + "completions/mean_terminated_length": 88.625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.29379836008776994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.008945084897277411, + "learning_rate": 2.914e-06, + "loss": 0.0004, + "num_tokens": 59172280.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 187.9375, + "completions/mean_terminated_length": 187.9375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.29391384686453403, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.008447494117717724, + "learning_rate": 2.912e-06, + "loss": 0.0003, + "num_tokens": 59200278.0, + "reward": 3.9649910926818848, + "reward_std": 0.19804063439369202, + "rewards/reward_fn/mean": 3.9649910926818848, + "rewards/reward_fn/std": 0.1980406641960144, + "step": 2545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 292.0, + "completions/mean_terminated_length": 292.0, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.2940293336412981, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.01084939725114964, + "learning_rate": 2.91e-06, + "loss": 0.0004, + "num_tokens": 59232182.0, + "reward": 3.9311892986297607, + "reward_std": 0.3892524242401123, + "rewards/reward_fn/mean": 3.9311892986297607, + "rewards/reward_fn/std": 0.3892523944377899, + "step": 2546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 178.53125, + "completions/mean_terminated_length": 178.53125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.2941448204180621, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.010769974353024736, + "learning_rate": 2.908e-06, + "loss": 0.0004, + "num_tokens": 59251623.0, + "reward": 3.120077610015869, + "reward_std": 0.31998950242996216, + "rewards/reward_fn/mean": 3.120077610015869, + "rewards/reward_fn/std": 0.31998953223228455, + "step": 2547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 255.0, + "completions/mean_terminated_length": 255.0, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.2942603071948262, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.012702829859335907, + "learning_rate": 2.9060000000000002e-06, + "loss": 0.0005, + "num_tokens": 59272583.0, + "reward": 3.6511693000793457, + "reward_std": 0.8236427307128906, + "rewards/reward_fn/mean": 3.6511693000793457, + "rewards/reward_fn/std": 0.8236426711082458, + "step": 2548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 79.4375, + "completions/mean_terminated_length": 79.4375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.29437579397159025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037109375, + "kl": 0.003188599149325455, + "learning_rate": 2.9039999999999996e-06, + "loss": 0.0001, + "num_tokens": 59285909.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 150.46875, + "completions/mean_terminated_length": 150.46875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.2944912807483543, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.012686249374382896, + "learning_rate": 2.902e-06, + "loss": 0.0005, + "num_tokens": 59307908.0, + "reward": 3.7765092849731445, + "reward_std": 0.3183625638484955, + "rewards/reward_fn/mean": 3.7765092849731445, + "rewards/reward_fn/std": 0.3183625042438507, + "step": 2550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 416.21875, + "completions/mean_terminated_length": 416.21875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.2946067675251184, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.014866999161313288, + "learning_rate": 2.8999999999999998e-06, + "loss": 0.0006, + "num_tokens": 59345067.0, + "reward": 2.733369827270508, + "reward_std": 0.35302162170410156, + "rewards/reward_fn/mean": 2.733369827270508, + "rewards/reward_fn/std": 0.35302162170410156, + "step": 2551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 234.75, + "completions/mean_terminated_length": 234.75, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.2947222543018824, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.006912110075063538, + "learning_rate": 2.898e-06, + "loss": 0.0003, + "num_tokens": 59375043.0, + "reward": 3.0925512313842773, + "reward_std": 0.3986169993877411, + "rewards/reward_fn/mean": 3.0925512313842773, + "rewards/reward_fn/std": 0.3986169397830963, + "step": 2552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 325.78125, + "completions/mean_terminated_length": 325.78125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.2948377410786465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033935546875, + "kl": 0.00796803869161522, + "learning_rate": 2.896e-06, + "loss": 0.0003, + "num_tokens": 59401596.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 221.59375, + "completions/mean_terminated_length": 221.59375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.29495322785541056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.009664215082011651, + "learning_rate": 2.894e-06, + "loss": 0.0004, + "num_tokens": 59433711.0, + "reward": 3.419668674468994, + "reward_std": 0.1444733440876007, + "rewards/reward_fn/mean": 3.419668674468994, + "rewards/reward_fn/std": 0.1444733738899231, + "step": 2554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 88.84375, + "completions/mean_terminated_length": 88.84375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.2950687146321746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.01247042491013417, + "learning_rate": 2.8919999999999996e-06, + "loss": 0.0005, + "num_tokens": 59463722.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 152.0625, + "completions/mean_terminated_length": 152.0625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.2951842014089387, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.010925624403171241, + "learning_rate": 2.89e-06, + "loss": 0.0004, + "num_tokens": 59478284.0, + "reward": 3.926102638244629, + "reward_std": 0.41802677512168884, + "rewards/reward_fn/mean": 3.926102638244629, + "rewards/reward_fn/std": 0.41802680492401123, + "step": 2556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1703.0, + "completions/max_terminated_length": 1703.0, + "completions/mean_length": 266.6875, + "completions/mean_terminated_length": 266.6875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.29529968818570274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.012484280596254393, + "learning_rate": 2.8879999999999997e-06, + "loss": 0.0005, + "num_tokens": 59499426.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 139.03125, + "completions/mean_terminated_length": 139.03125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.2954151749624668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.0079768424257054, + "learning_rate": 2.886e-06, + "loss": 0.0003, + "num_tokens": 59519075.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 132.96875, + "completions/mean_terminated_length": 132.96875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.29553066173923087, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.00942258449504152, + "learning_rate": 2.884e-06, + "loss": 0.0004, + "num_tokens": 59539042.0, + "reward": 3.956655502319336, + "reward_std": 0.17059145867824554, + "rewards/reward_fn/mean": 3.956655502319336, + "rewards/reward_fn/std": 0.17059147357940674, + "step": 2559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 137.78125, + "completions/mean_terminated_length": 137.78125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.2956461485159949, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056396484375, + "kl": 0.008994900988909649, + "learning_rate": 2.882e-06, + "loss": 0.0004, + "num_tokens": 59555003.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 155.75, + "completions/mean_terminated_length": 155.75, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.295761635292759, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.008549976475478616, + "learning_rate": 2.88e-06, + "loss": 0.0003, + "num_tokens": 59582483.0, + "reward": 3.725135326385498, + "reward_std": 0.7390026450157166, + "rewards/reward_fn/mean": 3.725135326385498, + "rewards/reward_fn/std": 0.7390025854110718, + "step": 2561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 216.4375, + "completions/mean_terminated_length": 216.4375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.29587712206952305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.015398211209685542, + "learning_rate": 2.878e-06, + "loss": 0.0006, + "num_tokens": 59601889.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 178.5625, + "completions/mean_terminated_length": 178.5625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.2959926088462871, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.016781263722805306, + "learning_rate": 2.8759999999999997e-06, + "loss": 0.0007, + "num_tokens": 59624595.0, + "reward": 3.2818498611450195, + "reward_std": 0.2174265831708908, + "rewards/reward_fn/mean": 3.2818498611450195, + "rewards/reward_fn/std": 0.2174265831708908, + "step": 2563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 211.46875, + "completions/mean_terminated_length": 211.46875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.2961080956230512, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.024089346232358366, + "learning_rate": 2.874e-06, + "loss": 0.001, + "num_tokens": 59650274.0, + "reward": 3.0363926887512207, + "reward_std": 0.06049569696187973, + "rewards/reward_fn/mean": 3.0363926887512207, + "rewards/reward_fn/std": 0.060495708137750626, + "step": 2564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 154.90625, + "completions/mean_terminated_length": 154.90625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.2962235823998152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048095703125, + "kl": 0.008311561068694573, + "learning_rate": 2.8719999999999998e-06, + "loss": 0.0003, + "num_tokens": 59665695.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 186.21875, + "completions/mean_terminated_length": 186.21875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.29633906917657926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.009580408514011651, + "learning_rate": 2.87e-06, + "loss": 0.0004, + "num_tokens": 59694310.0, + "reward": 3.8635787963867188, + "reward_std": 0.3300170600414276, + "rewards/reward_fn/mean": 3.8635787963867188, + "rewards/reward_fn/std": 0.3300170600414276, + "step": 2566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 161.09375, + "completions/mean_terminated_length": 161.09375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.29645455595334336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.008301210982608609, + "learning_rate": 2.868e-06, + "loss": 0.0003, + "num_tokens": 59721961.0, + "reward": 3.8619987964630127, + "reward_std": 0.5430428981781006, + "rewards/reward_fn/mean": 3.8619987964630127, + "rewards/reward_fn/std": 0.5430429577827454, + "step": 2567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 105.28125, + "completions/mean_terminated_length": 105.28125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.2965700427301074, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "kl": 0.007343840730754891, + "learning_rate": 2.866e-06, + "loss": 0.0003, + "num_tokens": 59744242.0, + "reward": 3.4788942337036133, + "reward_std": 0.08201045542955399, + "rewards/reward_fn/mean": 3.4788942337036133, + "rewards/reward_fn/std": 0.08201045542955399, + "step": 2568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 96.6875, + "completions/mean_terminated_length": 96.6875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.29668552950687144, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.009941190786776133, + "learning_rate": 2.8639999999999996e-06, + "loss": 0.0004, + "num_tokens": 59759496.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 67.65625, + "completions/mean_terminated_length": 67.65625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.29680101628363553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.158203125, + "kl": 0.01089432809385471, + "learning_rate": 2.862e-06, + "loss": 0.0004, + "num_tokens": 59780285.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 214.0, + "completions/mean_terminated_length": 214.0, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.2969165030603996, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.010215356029220857, + "learning_rate": 2.8599999999999997e-06, + "loss": 0.0004, + "num_tokens": 59796093.0, + "reward": 3.2839601039886475, + "reward_std": 0.23517665266990662, + "rewards/reward_fn/mean": 3.2839601039886475, + "rewards/reward_fn/std": 0.23517662286758423, + "step": 2571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 174.40625, + "completions/mean_terminated_length": 174.40625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.29703198983716367, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061767578125, + "kl": 0.012016253065667115, + "learning_rate": 2.858e-06, + "loss": 0.0005, + "num_tokens": 59824906.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 305.21875, + "completions/mean_terminated_length": 305.21875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.2971474766139277, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.01082003781630192, + "learning_rate": 2.856e-06, + "loss": 0.0004, + "num_tokens": 59842673.0, + "reward": 3.716364860534668, + "reward_std": 0.7626131176948547, + "rewards/reward_fn/mean": 3.716364860534668, + "rewards/reward_fn/std": 0.7626131176948547, + "step": 2573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 254.71875, + "completions/mean_terminated_length": 254.71875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.29726296339069175, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.010176020507060457, + "learning_rate": 2.854e-06, + "loss": 0.0004, + "num_tokens": 59873000.0, + "reward": 3.4902005195617676, + "reward_std": 0.458444744348526, + "rewards/reward_fn/mean": 3.4902005195617676, + "rewards/reward_fn/std": 0.458444744348526, + "step": 2574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 70.375, + "completions/mean_terminated_length": 70.375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.29737845016745584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.0062785795162199065, + "learning_rate": 2.8519999999999995e-06, + "loss": 0.0003, + "num_tokens": 59886644.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 385.21875, + "completions/mean_terminated_length": 385.21875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.2974939369442199, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.01122940395725891, + "learning_rate": 2.85e-06, + "loss": 0.0004, + "num_tokens": 59921307.0, + "reward": 3.928220510482788, + "reward_std": 0.4060462415218353, + "rewards/reward_fn/mean": 3.928220510482788, + "rewards/reward_fn/std": 0.4060462415218353, + "step": 2576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 207.375, + "completions/mean_terminated_length": 207.375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.2976094237209839, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.015034727955935523, + "learning_rate": 2.8479999999999997e-06, + "loss": 0.0006, + "num_tokens": 59944679.0, + "reward": 3.2254738807678223, + "reward_std": 0.2951607406139374, + "rewards/reward_fn/mean": 3.2254738807678223, + "rewards/reward_fn/std": 0.2951607406139374, + "step": 2577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 177.53125, + "completions/mean_terminated_length": 177.53125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.297724910497748, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.010079980012960732, + "learning_rate": 2.846e-06, + "loss": 0.0004, + "num_tokens": 59969784.0, + "reward": 3.8968615531921387, + "reward_std": 0.2780374586582184, + "rewards/reward_fn/mean": 3.8968615531921387, + "rewards/reward_fn/std": 0.2780374586582184, + "step": 2578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 294.75, + "completions/mean_terminated_length": 294.75, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.29784039727451206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03515625, + "kl": 0.006971653128857724, + "learning_rate": 2.844e-06, + "loss": 0.0003, + "num_tokens": 59993840.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 147.21875, + "completions/mean_terminated_length": 147.21875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.29795588405127615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.011462252412457019, + "learning_rate": 2.842e-06, + "loss": 0.0005, + "num_tokens": 60016727.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 62.65625, + "completions/mean_terminated_length": 62.65625, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.2980713708280402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1669921875, + "kl": 0.010701221675844863, + "learning_rate": 2.84e-06, + "loss": 0.0004, + "num_tokens": 60039756.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 65.0, + "completions/max_terminated_length": 65.0, + "completions/mean_length": 52.40625, + "completions/mean_terminated_length": 52.40625, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.29818685760480423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1669921875, + "kl": 0.010198749645496719, + "learning_rate": 2.8379999999999998e-06, + "loss": 0.0004, + "num_tokens": 60058617.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 252.75, + "completions/mean_terminated_length": 252.75, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.29830234438156833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042236328125, + "kl": 0.008004461891687242, + "learning_rate": 2.8359999999999996e-06, + "loss": 0.0003, + "num_tokens": 60082897.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 274.0625, + "completions/mean_terminated_length": 274.0625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.29841783115833237, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.012789981657988392, + "learning_rate": 2.834e-06, + "loss": 0.0005, + "num_tokens": 60105587.0, + "reward": 2.9122560024261475, + "reward_std": 0.2361413538455963, + "rewards/reward_fn/mean": 2.9122560024261475, + "rewards/reward_fn/std": 0.23614132404327393, + "step": 2584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 427.15625, + "completions/mean_terminated_length": 427.15625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.2985333179350964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042236328125, + "kl": 0.009053947658685502, + "learning_rate": 2.8319999999999997e-06, + "loss": 0.0004, + "num_tokens": 60129112.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 255.59375, + "completions/mean_terminated_length": 255.59375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.2986488047118605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04296875, + "kl": 0.007832185619918164, + "learning_rate": 2.83e-06, + "loss": 0.0003, + "num_tokens": 60152395.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 235.25, + "completions/mean_terminated_length": 235.25, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.29876429148862454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0400390625, + "kl": 0.005091296916361898, + "learning_rate": 2.828e-06, + "loss": 0.0002, + "num_tokens": 60184051.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 63.96875, + "completions/mean_terminated_length": 63.96875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.29887977826538864, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0625, + "kl": 0.009258345045964234, + "learning_rate": 2.826e-06, + "loss": 0.0004, + "num_tokens": 60206482.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 2588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 87.1875, + "completions/mean_terminated_length": 87.1875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.2989952650421527, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05517578125, + "kl": 0.005979772562568542, + "learning_rate": 2.8239999999999996e-06, + "loss": 0.0002, + "num_tokens": 60222776.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 328.5625, + "completions/mean_terminated_length": 328.5625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.2991107518189167, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.024984710878925398, + "learning_rate": 2.822e-06, + "loss": 0.001, + "num_tokens": 60256234.0, + "reward": 2.515786647796631, + "reward_std": 0.6797155141830444, + "rewards/reward_fn/mean": 2.515786647796631, + "rewards/reward_fn/std": 0.6797154545783997, + "step": 2590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 169.65625, + "completions/mean_terminated_length": 169.65625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.2992262385956808, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.01773313332523685, + "learning_rate": 2.8199999999999997e-06, + "loss": 0.0007, + "num_tokens": 60284767.0, + "reward": 3.7882232666015625, + "reward_std": 0.6689854264259338, + "rewards/reward_fn/mean": 3.7882232666015625, + "rewards/reward_fn/std": 0.6689854264259338, + "step": 2591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 215.34375, + "completions/mean_terminated_length": 215.34375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.29934172537244486, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.009402472453075461, + "learning_rate": 2.818e-06, + "loss": 0.0004, + "num_tokens": 60303242.0, + "reward": 3.9287171363830566, + "reward_std": 0.40323740243911743, + "rewards/reward_fn/mean": 3.9287171363830566, + "rewards/reward_fn/std": 0.40323734283447266, + "step": 2592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 152.25, + "completions/mean_terminated_length": 152.25, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.2994572121492089, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.011566609566216357, + "learning_rate": 2.816e-06, + "loss": 0.0005, + "num_tokens": 60320466.0, + "reward": 3.7078423500061035, + "reward_std": 0.47520437836647034, + "rewards/reward_fn/mean": 3.7078423500061035, + "rewards/reward_fn/std": 0.47520434856414795, + "step": 2593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 174.59375, + "completions/mean_terminated_length": 174.59375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.299572698925973, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.023394439805997536, + "learning_rate": 2.814e-06, + "loss": 0.0009, + "num_tokens": 60346469.0, + "reward": 3.97676420211792, + "reward_std": 0.13144119083881378, + "rewards/reward_fn/mean": 3.97676420211792, + "rewards/reward_fn/std": 0.1314411759376526, + "step": 2594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 176.21875, + "completions/mean_terminated_length": 176.21875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.29968818570273703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.007998974011570681, + "learning_rate": 2.8119999999999995e-06, + "loss": 0.0003, + "num_tokens": 60371244.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 62.1875, + "completions/mean_terminated_length": 62.1875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.29980367247950107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.005721300129152951, + "learning_rate": 2.8099999999999998e-06, + "loss": 0.0002, + "num_tokens": 60391570.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 204.59375, + "completions/mean_terminated_length": 204.59375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.29991915925626517, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.012481640456826426, + "learning_rate": 2.8079999999999996e-06, + "loss": 0.0005, + "num_tokens": 60417701.0, + "reward": 3.9728758335113525, + "reward_std": 0.1534377932548523, + "rewards/reward_fn/mean": 3.9728758335113525, + "rewards/reward_fn/std": 0.15343782305717468, + "step": 2597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 171.8125, + "completions/mean_terminated_length": 171.8125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.3000346460330292, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.008524002281774301, + "learning_rate": 2.806e-06, + "loss": 0.0003, + "num_tokens": 60442271.0, + "reward": 2.8333582878112793, + "reward_std": 0.3102012574672699, + "rewards/reward_fn/mean": 2.8333582878112793, + "rewards/reward_fn/std": 0.3102012574672699, + "step": 2598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 69.34375, + "completions/mean_terminated_length": 69.34375, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.3001501328097933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11376953125, + "kl": 0.007941994059365243, + "learning_rate": 2.8039999999999998e-06, + "loss": 0.0003, + "num_tokens": 60468714.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 76.96875, + "completions/mean_terminated_length": 76.96875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.30026561958655734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0576171875, + "kl": 0.005620444375381339, + "learning_rate": 2.802e-06, + "loss": 0.0002, + "num_tokens": 60494121.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 190.53125, + "completions/mean_terminated_length": 190.53125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.3003811063633214, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.013576488709077239, + "learning_rate": 2.8e-06, + "loss": 0.0005, + "num_tokens": 60516218.0, + "reward": 2.9007561206817627, + "reward_std": 0.04687732830643654, + "rewards/reward_fn/mean": 2.9007561206817627, + "rewards/reward_fn/std": 0.04687735065817833, + "step": 2601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 75.96875, + "completions/mean_terminated_length": 75.96875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.3004965931400855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1494140625, + "kl": 0.009936434675182682, + "learning_rate": 2.7979999999999997e-06, + "loss": 0.0004, + "num_tokens": 60528217.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 213.25, + "completions/mean_terminated_length": 213.25, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.3006120799168495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03857421875, + "kl": 0.0074879379717458505, + "learning_rate": 2.7959999999999996e-06, + "loss": 0.0003, + "num_tokens": 60556257.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 181.40625, + "completions/mean_terminated_length": 181.40625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.30072756669361356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046630859375, + "kl": 0.0073105982737615705, + "learning_rate": 2.794e-06, + "loss": 0.0003, + "num_tokens": 60577070.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 92.25, + "completions/mean_terminated_length": 92.25, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.30084305347037765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.007884965794801246, + "learning_rate": 2.7919999999999997e-06, + "loss": 0.0003, + "num_tokens": 60598166.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 114.84375, + "completions/mean_terminated_length": 114.84375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.3009585402471417, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.007438539330905769, + "learning_rate": 2.79e-06, + "loss": 0.0003, + "num_tokens": 60623633.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 104.65625, + "completions/mean_terminated_length": 104.65625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.3010740270239058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.007923035704152426, + "learning_rate": 2.788e-06, + "loss": 0.0003, + "num_tokens": 60648134.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 85.15625, + "completions/mean_terminated_length": 85.15625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.30118951380066983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.007028398395050317, + "learning_rate": 2.786e-06, + "loss": 0.0003, + "num_tokens": 60674379.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 223.78125, + "completions/mean_terminated_length": 223.78125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.30130500057743387, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.008812752974336036, + "learning_rate": 2.7839999999999995e-06, + "loss": 0.0004, + "num_tokens": 60702180.0, + "reward": 3.8602242469787598, + "reward_std": 0.5500142574310303, + "rewards/reward_fn/mean": 3.8602242469787598, + "rewards/reward_fn/std": 0.5500141382217407, + "step": 2609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 249.65625, + "completions/mean_terminated_length": 249.65625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.30142048735419796, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.016568460763664916, + "learning_rate": 2.782e-06, + "loss": 0.0007, + "num_tokens": 60728057.0, + "reward": 3.6219420433044434, + "reward_std": 0.4239625036716461, + "rewards/reward_fn/mean": 3.6219420433044434, + "rewards/reward_fn/std": 0.4239625036716461, + "step": 2610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 93.5, + "completions/mean_terminated_length": 93.5, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.301535974130962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.008224388660892146, + "learning_rate": 2.7799999999999996e-06, + "loss": 0.0003, + "num_tokens": 60742025.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 179.15625, + "completions/mean_terminated_length": 179.15625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.30165146090772604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.010726025415351614, + "learning_rate": 2.778e-06, + "loss": 0.0004, + "num_tokens": 60759438.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 229.28125, + "completions/mean_terminated_length": 229.28125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.30176694768449014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.009052217174030375, + "learning_rate": 2.7759999999999998e-06, + "loss": 0.0004, + "num_tokens": 60787991.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 147.40625, + "completions/mean_terminated_length": 147.40625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.3018824344612542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.154296875, + "kl": 0.012910848978208378, + "learning_rate": 2.774e-06, + "loss": 0.0005, + "num_tokens": 60809732.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 188.375, + "completions/mean_terminated_length": 188.375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.3019979212380183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.013345777697395533, + "learning_rate": 2.7719999999999995e-06, + "loss": 0.0005, + "num_tokens": 60838608.0, + "reward": 3.6907083988189697, + "reward_std": 0.4676326513290405, + "rewards/reward_fn/mean": 3.6907083988189697, + "rewards/reward_fn/std": 0.46763262152671814, + "step": 2615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 222.40625, + "completions/mean_terminated_length": 222.40625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.3021134080147823, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.015423024509800598, + "learning_rate": 2.7699999999999997e-06, + "loss": 0.0006, + "num_tokens": 60869917.0, + "reward": 3.649592876434326, + "reward_std": 0.5303497910499573, + "rewards/reward_fn/mean": 3.649592876434326, + "rewards/reward_fn/std": 0.5303497910499573, + "step": 2616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 278.375, + "completions/mean_terminated_length": 278.375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.30222889479154635, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.008858514433086384, + "learning_rate": 2.7679999999999996e-06, + "loss": 0.0004, + "num_tokens": 60889705.0, + "reward": 3.9302282333374023, + "reward_std": 0.3946893513202667, + "rewards/reward_fn/mean": 3.9302282333374023, + "rewards/reward_fn/std": 0.39468929171562195, + "step": 2617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 189.6875, + "completions/mean_terminated_length": 189.6875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.30234438156831045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044189453125, + "kl": 0.008465941362373997, + "learning_rate": 2.766e-06, + "loss": 0.0003, + "num_tokens": 60907711.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 430.375, + "completions/mean_terminated_length": 430.375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.3024598683450745, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.01328413154988084, + "learning_rate": 2.7639999999999997e-06, + "loss": 0.0005, + "num_tokens": 60936555.0, + "reward": 3.648688316345215, + "reward_std": 0.8294490575790405, + "rewards/reward_fn/mean": 3.648688316345215, + "rewards/reward_fn/std": 0.8294490575790405, + "step": 2619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 80.65625, + "completions/mean_terminated_length": 80.65625, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.30257535512183853, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.953125, + "kl": 0.00939896957424935, + "learning_rate": 2.762e-06, + "loss": 0.0004, + "num_tokens": 60955328.0, + "reward": 3.6406068801879883, + "reward_std": 0.025088321417570114, + "rewards/reward_fn/mean": 3.6406068801879883, + "rewards/reward_fn/std": 0.025088341906666756, + "step": 2620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 145.0, + "completions/mean_terminated_length": 145.0, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.3026908418986026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.010556742498010863, + "learning_rate": 2.76e-06, + "loss": 0.0004, + "num_tokens": 60981248.0, + "reward": 3.909937858581543, + "reward_std": 0.28644734621047974, + "rewards/reward_fn/mean": 3.909937858581543, + "rewards/reward_fn/std": 0.28644734621047974, + "step": 2621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 198.75, + "completions/mean_terminated_length": 198.75, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.30280632867536666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.010320818168111145, + "learning_rate": 2.7579999999999997e-06, + "loss": 0.0004, + "num_tokens": 61001624.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 297.0, + "completions/mean_terminated_length": 297.0, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.3029218154521307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037841796875, + "kl": 0.008399147918680683, + "learning_rate": 2.7559999999999995e-06, + "loss": 0.0003, + "num_tokens": 61022200.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 176.4375, + "completions/mean_terminated_length": 176.4375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.3030373022288948, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.00831935448513832, + "learning_rate": 2.754e-06, + "loss": 0.0003, + "num_tokens": 61042918.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 168.21875, + "completions/mean_terminated_length": 168.21875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.30315278900565884, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051025390625, + "kl": 0.007106233009835705, + "learning_rate": 2.7519999999999997e-06, + "loss": 0.0003, + "num_tokens": 61064653.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 283.6875, + "completions/mean_terminated_length": 283.6875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.30326827578242294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04052734375, + "kl": 0.008854344319843221, + "learning_rate": 2.75e-06, + "loss": 0.0004, + "num_tokens": 61085315.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 104.53125, + "completions/mean_terminated_length": 104.53125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.303383762559187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036865234375, + "kl": 0.00359085906893597, + "learning_rate": 2.748e-06, + "loss": 0.0001, + "num_tokens": 61101428.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 106.09375, + "completions/mean_terminated_length": 106.09375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.303499249335951, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.010358235318562947, + "learning_rate": 2.746e-06, + "loss": 0.0004, + "num_tokens": 61115895.0, + "reward": 3.9506635665893555, + "reward_std": 0.19416166841983795, + "rewards/reward_fn/mean": 3.9506635665893555, + "rewards/reward_fn/std": 0.19416168332099915, + "step": 2628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 67.6875, + "completions/mean_terminated_length": 67.6875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.3036147361127151, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.005142688471096335, + "learning_rate": 2.744e-06, + "loss": 0.0002, + "num_tokens": 61126797.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 58.0, + "completions/max_terminated_length": 58.0, + "completions/mean_length": 41.75, + "completions/mean_terminated_length": 41.75, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.30373022288947915, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.5, + "kl": 0.020924815489706816, + "learning_rate": 2.7419999999999998e-06, + "loss": 0.0008, + "num_tokens": 61141477.0, + "reward": 3.625, + "reward_std": 1.1845782995224, + "rewards/reward_fn/mean": 3.625, + "rewards/reward_fn/std": 1.1845782995224, + "step": 2630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 235.625, + "completions/mean_terminated_length": 235.625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.3038457096662432, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.0061680914732278325, + "learning_rate": 2.74e-06, + "loss": 0.0002, + "num_tokens": 61164697.0, + "reward": 3.9317142963409424, + "reward_std": 0.3862823247909546, + "rewards/reward_fn/mean": 3.9317142963409424, + "rewards/reward_fn/std": 0.38628238439559937, + "step": 2631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 102.625, + "completions/mean_terminated_length": 102.625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3039611964430073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.007310486340429634, + "learning_rate": 2.738e-06, + "loss": 0.0003, + "num_tokens": 61192205.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 209.28125, + "completions/mean_terminated_length": 209.28125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.3040766832197713, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.013149252336006612, + "learning_rate": 2.736e-06, + "loss": 0.0005, + "num_tokens": 61220470.0, + "reward": 2.884122371673584, + "reward_std": 0.2214922159910202, + "rewards/reward_fn/mean": 2.884122371673584, + "rewards/reward_fn/std": 0.22149218618869781, + "step": 2633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 128.21875, + "completions/mean_terminated_length": 128.21875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.3041921699965354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057373046875, + "kl": 0.007469410767953377, + "learning_rate": 2.734e-06, + "loss": 0.0003, + "num_tokens": 61240541.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 416.125, + "completions/mean_terminated_length": 416.125, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.30430765677329946, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.010534177796216682, + "learning_rate": 2.7320000000000003e-06, + "loss": 0.0004, + "num_tokens": 61268609.0, + "reward": 3.853153705596924, + "reward_std": 0.49592724442481995, + "rewards/reward_fn/mean": 3.853153705596924, + "rewards/reward_fn/std": 0.49592721462249756, + "step": 2635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 97.625, + "completions/mean_terminated_length": 97.625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.3044231435500635, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "kl": 0.009628512147173751, + "learning_rate": 2.7299999999999997e-06, + "loss": 0.0004, + "num_tokens": 61299189.0, + "reward": 3.978641986846924, + "reward_std": 0.12081914395093918, + "rewards/reward_fn/mean": 3.978641986846924, + "rewards/reward_fn/std": 0.12081912159919739, + "step": 2636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 246.90625, + "completions/mean_terminated_length": 246.90625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.3045386303268276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051025390625, + "kl": 0.009395722838235088, + "learning_rate": 2.728e-06, + "loss": 0.0004, + "num_tokens": 61322322.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 120.875, + "completions/mean_terminated_length": 120.875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.30465411710359164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.02116285242664162, + "learning_rate": 2.726e-06, + "loss": 0.0008, + "num_tokens": 61344142.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 176.125, + "completions/mean_terminated_length": 176.125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.3047696038803557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11669921875, + "kl": 0.015411488901008852, + "learning_rate": 2.724e-06, + "loss": 0.0006, + "num_tokens": 61366610.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 239.21875, + "completions/mean_terminated_length": 239.21875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.3048850906571198, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.015439295384567231, + "learning_rate": 2.722e-06, + "loss": 0.0006, + "num_tokens": 61392889.0, + "reward": 2.970059871673584, + "reward_std": 0.02939586341381073, + "rewards/reward_fn/mean": 2.970059871673584, + "rewards/reward_fn/std": 0.029395895078778267, + "step": 2640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 200.8125, + "completions/mean_terminated_length": 200.8125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.3050005774338838, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.016872946856892668, + "learning_rate": 2.7200000000000002e-06, + "loss": 0.0007, + "num_tokens": 61417779.0, + "reward": 3.728947639465332, + "reward_std": 0.3909694254398346, + "rewards/reward_fn/mean": 3.728947639465332, + "rewards/reward_fn/std": 0.3909693658351898, + "step": 2641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 174.28125, + "completions/mean_terminated_length": 174.28125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.3051160642106479, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.009153047401923686, + "learning_rate": 2.7179999999999996e-06, + "loss": 0.0004, + "num_tokens": 61436156.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 279.5625, + "completions/mean_terminated_length": 279.5625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.30523155098741195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.00845573699916713, + "learning_rate": 2.716e-06, + "loss": 0.0003, + "num_tokens": 61460750.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 204.34375, + "completions/mean_terminated_length": 204.34375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.305347037764176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.010333546335459687, + "learning_rate": 2.7139999999999998e-06, + "loss": 0.0004, + "num_tokens": 61479321.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 125.375, + "completions/mean_terminated_length": 125.375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.3054625245409401, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.00801220162975369, + "learning_rate": 2.712e-06, + "loss": 0.0003, + "num_tokens": 61495525.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 165.71875, + "completions/mean_terminated_length": 165.71875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.3055780113177041, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.015699259151006117, + "learning_rate": 2.71e-06, + "loss": 0.0006, + "num_tokens": 61515804.0, + "reward": 3.9400277137756348, + "reward_std": 0.23600734770298004, + "rewards/reward_fn/mean": 3.9400277137756348, + "rewards/reward_fn/std": 0.23600730299949646, + "step": 2646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 284.53125, + "completions/mean_terminated_length": 284.53125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.30569349809446816, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.008587941629230045, + "learning_rate": 2.708e-06, + "loss": 0.0003, + "num_tokens": 61537037.0, + "reward": 3.794356346130371, + "reward_std": 0.44440171122550964, + "rewards/reward_fn/mean": 3.794356346130371, + "rewards/reward_fn/std": 0.44440168142318726, + "step": 2647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 194.34375, + "completions/mean_terminated_length": 194.34375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.30580898487123226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.012889170175185427, + "learning_rate": 2.706e-06, + "loss": 0.0005, + "num_tokens": 61555128.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 75.78125, + "completions/mean_terminated_length": 75.78125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.3059244716479963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.007141072699596407, + "learning_rate": 2.704e-06, + "loss": 0.0003, + "num_tokens": 61568849.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 317.40625, + "completions/mean_terminated_length": 317.40625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.30603995842476034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.012450681431801058, + "learning_rate": 2.7019999999999997e-06, + "loss": 0.0005, + "num_tokens": 61591230.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 142.0, + "completions/mean_terminated_length": 142.0, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.30615544520152443, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.009982194867916405, + "learning_rate": 2.7e-06, + "loss": 0.0004, + "num_tokens": 61607902.0, + "reward": 3.7232003211975098, + "reward_std": 0.7441118955612183, + "rewards/reward_fn/mean": 3.7232003211975098, + "rewards/reward_fn/std": 0.7441118359565735, + "step": 2651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 143.5625, + "completions/mean_terminated_length": 143.5625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.3062709319782885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.012557713358546607, + "learning_rate": 2.698e-06, + "loss": 0.0005, + "num_tokens": 61636080.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 174.3125, + "completions/mean_terminated_length": 174.3125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.30638641875505257, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.010183119688008446, + "learning_rate": 2.696e-06, + "loss": 0.0004, + "num_tokens": 61667354.0, + "reward": 3.084779739379883, + "reward_std": 0.18551796674728394, + "rewards/reward_fn/mean": 3.084779739379883, + "rewards/reward_fn/std": 0.18551799654960632, + "step": 2653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 288.3125, + "completions/mean_terminated_length": 288.3125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.3065019055318166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.008681109844474122, + "learning_rate": 2.694e-06, + "loss": 0.0003, + "num_tokens": 61700964.0, + "reward": 3.9249024391174316, + "reward_std": 0.42481642961502075, + "rewards/reward_fn/mean": 3.9249024391174316, + "rewards/reward_fn/std": 0.424816370010376, + "step": 2654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 169.34375, + "completions/mean_terminated_length": 169.34375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.30661739230858065, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.016705179717973806, + "learning_rate": 2.6920000000000002e-06, + "loss": 0.0007, + "num_tokens": 61718095.0, + "reward": 3.6529016494750977, + "reward_std": 0.4284321665763855, + "rewards/reward_fn/mean": 3.6529016494750977, + "rewards/reward_fn/std": 0.4284321367740631, + "step": 2655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1784.0, + "completions/max_terminated_length": 1784.0, + "completions/mean_length": 475.1875, + "completions/mean_terminated_length": 475.1875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.30673287908534475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061767578125, + "kl": 0.013844817571225576, + "learning_rate": 2.6899999999999997e-06, + "loss": 0.0006, + "num_tokens": 61745589.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 165.28125, + "completions/mean_terminated_length": 165.28125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.3068483658621088, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.010757479700259864, + "learning_rate": 2.688e-06, + "loss": 0.0004, + "num_tokens": 61768990.0, + "reward": 3.8440370559692383, + "reward_std": 0.3727031648159027, + "rewards/reward_fn/mean": 3.8440370559692383, + "rewards/reward_fn/std": 0.3727031946182251, + "step": 2657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 85.375, + "completions/mean_terminated_length": 85.375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.3069638526388728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419921875, + "kl": 0.004955860022164416, + "learning_rate": 2.6859999999999998e-06, + "loss": 0.0002, + "num_tokens": 61788842.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 279.65625, + "completions/mean_terminated_length": 279.65625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.3070793394156369, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.014058656946872361, + "learning_rate": 2.684e-06, + "loss": 0.0006, + "num_tokens": 61810495.0, + "reward": 3.931731700897217, + "reward_std": 0.3861846625804901, + "rewards/reward_fn/mean": 3.931731700897217, + "rewards/reward_fn/std": 0.3861846625804901, + "step": 2659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 201.21875, + "completions/mean_terminated_length": 201.21875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.30719482619240096, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.007233549156808294, + "learning_rate": 2.682e-06, + "loss": 0.0003, + "num_tokens": 61837606.0, + "reward": 3.973529815673828, + "reward_std": 0.14973735809326172, + "rewards/reward_fn/mean": 3.973529815673828, + "rewards/reward_fn/std": 0.1497373729944229, + "step": 2660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 215.75, + "completions/mean_terminated_length": 215.75, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.30731031296916506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.010600558583973907, + "learning_rate": 2.68e-06, + "loss": 0.0004, + "num_tokens": 61868478.0, + "reward": 3.878739833831787, + "reward_std": 0.3263198435306549, + "rewards/reward_fn/mean": 3.878739833831787, + "rewards/reward_fn/std": 0.3263198435306549, + "step": 2661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 122.59375, + "completions/mean_terminated_length": 122.59375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.3074257997459291, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04296875, + "kl": 0.00682371831862838, + "learning_rate": 2.678e-06, + "loss": 0.0003, + "num_tokens": 61886481.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 134.84375, + "completions/mean_terminated_length": 134.84375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.30754128652269314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051025390625, + "kl": 0.0055288703078986146, + "learning_rate": 2.676e-06, + "loss": 0.0002, + "num_tokens": 61902412.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 225.25, + "completions/mean_terminated_length": 225.25, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.30765677329945723, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.014910204175976105, + "learning_rate": 2.6739999999999997e-06, + "loss": 0.0006, + "num_tokens": 61930228.0, + "reward": 2.95534086227417, + "reward_std": 0.059960514307022095, + "rewards/reward_fn/mean": 2.95534086227417, + "rewards/reward_fn/std": 0.05996048077940941, + "step": 2664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 86.0625, + "completions/mean_terminated_length": 86.0625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.30777226007622127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.005693542327207979, + "learning_rate": 2.672e-06, + "loss": 0.0002, + "num_tokens": 61956726.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 92.1875, + "completions/mean_terminated_length": 92.1875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.3078877468529853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.014041535556316376, + "learning_rate": 2.67e-06, + "loss": 0.0006, + "num_tokens": 61982908.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 86.15625, + "completions/mean_terminated_length": 86.15625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.3080032336297494, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.03125, + "kl": 0.0102555633202428, + "learning_rate": 2.668e-06, + "loss": 0.0004, + "num_tokens": 62001377.0, + "reward": 3.212075710296631, + "reward_std": 0.031064942479133606, + "rewards/reward_fn/mean": 3.212075710296631, + "rewards/reward_fn/std": 0.031064948067069054, + "step": 2667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1515.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 182.09375, + "completions/mean_terminated_length": 182.09375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.30811872040651345, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.012878877343609929, + "learning_rate": 2.666e-06, + "loss": 0.0005, + "num_tokens": 62031748.0, + "reward": 3.8301713466644287, + "reward_std": 0.5638442039489746, + "rewards/reward_fn/mean": 3.8301713466644287, + "rewards/reward_fn/std": 0.5638442039489746, + "step": 2668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 73.625, + "completions/mean_terminated_length": 73.625, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.30823420718327754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.00597961480889353, + "learning_rate": 2.664e-06, + "loss": 0.0002, + "num_tokens": 62052888.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 87.4375, + "completions/mean_terminated_length": 87.4375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.3083496939600416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.008272506165667437, + "learning_rate": 2.6619999999999997e-06, + "loss": 0.0003, + "num_tokens": 62080262.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 215.8125, + "completions/mean_terminated_length": 215.8125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.3084651807368056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.0156619996560039, + "learning_rate": 2.66e-06, + "loss": 0.0006, + "num_tokens": 62099072.0, + "reward": 3.976386308670044, + "reward_std": 0.13357891142368317, + "rewards/reward_fn/mean": 3.976386308670044, + "rewards/reward_fn/std": 0.13357891142368317, + "step": 2671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 145.96875, + "completions/mean_terminated_length": 145.96875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.3085806675135697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.0141579063856625, + "learning_rate": 2.658e-06, + "loss": 0.0006, + "num_tokens": 62118879.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 147.15625, + "completions/mean_terminated_length": 147.15625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.30869615429033376, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.01719076833978761, + "learning_rate": 2.656e-06, + "loss": 0.0007, + "num_tokens": 62144196.0, + "reward": 3.9545655250549316, + "reward_std": 0.1840795874595642, + "rewards/reward_fn/mean": 3.9545655250549316, + "rewards/reward_fn/std": 0.1840796172618866, + "step": 2673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 315.125, + "completions/mean_terminated_length": 315.125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.3088116410670978, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.011371815140591934, + "learning_rate": 2.654e-06, + "loss": 0.0005, + "num_tokens": 62169768.0, + "reward": 3.8999452590942383, + "reward_std": 0.43182262778282166, + "rewards/reward_fn/mean": 3.8999452590942383, + "rewards/reward_fn/std": 0.43182259798049927, + "step": 2674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 222.75, + "completions/mean_terminated_length": 222.75, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.3089271278438619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.00981105388927972, + "learning_rate": 2.652e-06, + "loss": 0.0004, + "num_tokens": 62198272.0, + "reward": 3.766052722930908, + "reward_std": 0.5037506222724915, + "rewards/reward_fn/mean": 3.766052722930908, + "rewards/reward_fn/std": 0.5037506222724915, + "step": 2675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 404.46875, + "completions/mean_terminated_length": 404.46875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.30904261462062593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.0099595159990713, + "learning_rate": 2.6499999999999996e-06, + "loss": 0.0004, + "num_tokens": 62223503.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 166.125, + "completions/mean_terminated_length": 166.125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.30915810139739, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042724609375, + "kl": 0.00608650672074873, + "learning_rate": 2.648e-06, + "loss": 0.0002, + "num_tokens": 62251283.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 151.46875, + "completions/mean_terminated_length": 151.46875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.30927358817415407, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "kl": 0.014733784031705, + "learning_rate": 2.6459999999999997e-06, + "loss": 0.0006, + "num_tokens": 62267938.0, + "reward": 3.7037787437438965, + "reward_std": 0.7963102459907532, + "rewards/reward_fn/mean": 3.7037787437438965, + "rewards/reward_fn/std": 0.796310305595398, + "step": 2678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 132.3125, + "completions/mean_terminated_length": 132.3125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.3093890749509181, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.008630143391201273, + "learning_rate": 2.644e-06, + "loss": 0.0003, + "num_tokens": 62290220.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 253.65625, + "completions/mean_terminated_length": 253.65625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.3095045617276822, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.010601203175610863, + "learning_rate": 2.642e-06, + "loss": 0.0004, + "num_tokens": 62311073.0, + "reward": 3.8605144023895264, + "reward_std": 0.5488749146461487, + "rewards/reward_fn/mean": 3.8605144023895264, + "rewards/reward_fn/std": 0.5488749146461487, + "step": 2680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 99.34375, + "completions/mean_terminated_length": 99.34375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.30962004850444624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.006823665229603648, + "learning_rate": 2.64e-06, + "loss": 0.0003, + "num_tokens": 62330700.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 137.4375, + "completions/mean_terminated_length": 137.4375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.3097355352812103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0546875, + "kl": 0.009586403422872536, + "learning_rate": 2.638e-06, + "loss": 0.0004, + "num_tokens": 62355386.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 280.4375, + "completions/mean_terminated_length": 280.4375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.3098510220579744, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.012047836920828559, + "learning_rate": 2.636e-06, + "loss": 0.0005, + "num_tokens": 62376360.0, + "reward": 3.926863431930542, + "reward_std": 0.41372284293174744, + "rewards/reward_fn/mean": 3.926863431930542, + "rewards/reward_fn/std": 0.41372284293174744, + "step": 2683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 351.875, + "completions/mean_terminated_length": 351.875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.3099665088347384, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.01307351715513505, + "learning_rate": 2.6339999999999997e-06, + "loss": 0.0005, + "num_tokens": 62409860.0, + "reward": 3.9428963661193848, + "reward_std": 0.22524656355381012, + "rewards/reward_fn/mean": 3.9428963661193848, + "rewards/reward_fn/std": 0.22524656355381012, + "step": 2684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 198.75, + "completions/mean_terminated_length": 198.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.31008199561150246, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.008219933835789561, + "learning_rate": 2.632e-06, + "loss": 0.0003, + "num_tokens": 62424060.0, + "reward": 3.929816961288452, + "reward_std": 0.39701491594314575, + "rewards/reward_fn/mean": 3.929816961288452, + "rewards/reward_fn/std": 0.39701494574546814, + "step": 2685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 165.125, + "completions/mean_terminated_length": 165.125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.31019748238826655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.010090269439388067, + "learning_rate": 2.63e-06, + "loss": 0.0004, + "num_tokens": 62441440.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 326.875, + "completions/mean_terminated_length": 326.875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.3103129691650306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044921875, + "kl": 0.008819265072816052, + "learning_rate": 2.628e-06, + "loss": 0.0004, + "num_tokens": 62467452.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 340.5625, + "completions/mean_terminated_length": 340.5625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.3104284559417947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0361328125, + "kl": 0.008459456614218652, + "learning_rate": 2.626e-06, + "loss": 0.0003, + "num_tokens": 62490414.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 153.75, + "completions/mean_terminated_length": 153.75, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.31054394271855873, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.017879346502013505, + "learning_rate": 2.624e-06, + "loss": 0.0007, + "num_tokens": 62512518.0, + "reward": 3.9380416870117188, + "reward_std": 0.24966667592525482, + "rewards/reward_fn/mean": 3.9380416870117188, + "rewards/reward_fn/std": 0.2496667057275772, + "step": 2689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 108.46875, + "completions/mean_terminated_length": 108.46875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.31065942949532277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.0057025334135687444, + "learning_rate": 2.6219999999999996e-06, + "loss": 0.0002, + "num_tokens": 62543317.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 144.90625, + "completions/mean_terminated_length": 144.90625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.31077491627208687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.012791574583388865, + "learning_rate": 2.62e-06, + "loss": 0.0005, + "num_tokens": 62564658.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 114.9375, + "completions/mean_terminated_length": 114.9375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.3108904030488509, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "kl": 0.013606353284558281, + "learning_rate": 2.6179999999999998e-06, + "loss": 0.0005, + "num_tokens": 62579216.0, + "reward": 3.9798471927642822, + "reward_std": 0.1140015572309494, + "rewards/reward_fn/mean": 3.9798471927642822, + "rewards/reward_fn/std": 0.1140015721321106, + "step": 2692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 113.96875, + "completions/mean_terminated_length": 113.96875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.31100588982561495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.010795322959893383, + "learning_rate": 2.616e-06, + "loss": 0.0004, + "num_tokens": 62608751.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 72.34375, + "completions/mean_terminated_length": 72.34375, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.31112137660237904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.162109375, + "kl": 0.014000861774547957, + "learning_rate": 2.614e-06, + "loss": 0.0006, + "num_tokens": 62629050.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 175.96875, + "completions/mean_terminated_length": 175.96875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.3112368633791431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.01563686279405374, + "learning_rate": 2.612e-06, + "loss": 0.0006, + "num_tokens": 62649401.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 149.1875, + "completions/mean_terminated_length": 149.1875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.3113523501559072, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.0102797756990185, + "learning_rate": 2.6099999999999996e-06, + "loss": 0.0004, + "num_tokens": 62676287.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 278.71875, + "completions/mean_terminated_length": 278.71875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.3114678369326712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.009276467499148566, + "learning_rate": 2.608e-06, + "loss": 0.0004, + "num_tokens": 62700214.0, + "reward": 3.7067770957946777, + "reward_std": 0.5430790185928345, + "rewards/reward_fn/mean": 3.7067770957946777, + "rewards/reward_fn/std": 0.5430789589881897, + "step": 2697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 171.59375, + "completions/mean_terminated_length": 171.59375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.31158332370943526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048095703125, + "kl": 0.0075335383444326, + "learning_rate": 2.6059999999999997e-06, + "loss": 0.0003, + "num_tokens": 62718441.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 192.1875, + "completions/mean_terminated_length": 192.1875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.31169881048619935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03759765625, + "kl": 0.004197133534034947, + "learning_rate": 2.604e-06, + "loss": 0.0002, + "num_tokens": 62749135.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 167.03125, + "completions/mean_terminated_length": 167.03125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3118142972629634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053955078125, + "kl": 0.009906495353789069, + "learning_rate": 2.602e-06, + "loss": 0.0004, + "num_tokens": 62766640.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 87.71875, + "completions/mean_terminated_length": 87.71875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.31192978403972743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.011758772008761298, + "learning_rate": 2.6e-06, + "loss": 0.0005, + "num_tokens": 62792903.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 281.6875, + "completions/mean_terminated_length": 281.6875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.3120452708164915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.010006962991610635, + "learning_rate": 2.598e-06, + "loss": 0.0004, + "num_tokens": 62824637.0, + "reward": 2.736880302429199, + "reward_std": 0.3049788177013397, + "rewards/reward_fn/mean": 2.736880302429199, + "rewards/reward_fn/std": 0.3049788475036621, + "step": 2702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 194.09375, + "completions/mean_terminated_length": 194.09375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.31216075759325557, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.018706613234826364, + "learning_rate": 2.596e-06, + "loss": 0.0007, + "num_tokens": 62853536.0, + "reward": 3.426295757293701, + "reward_std": 0.2180057168006897, + "rewards/reward_fn/mean": 3.426295757293701, + "rewards/reward_fn/std": 0.2180057168006897, + "step": 2703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 181.96875, + "completions/mean_terminated_length": 181.96875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.3122762443700196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.007850163448893, + "learning_rate": 2.5939999999999996e-06, + "loss": 0.0003, + "num_tokens": 62872319.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 171.40625, + "completions/mean_terminated_length": 171.40625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.3123917311467837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.009611308632884175, + "learning_rate": 2.592e-06, + "loss": 0.0004, + "num_tokens": 62889324.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 99.90625, + "completions/mean_terminated_length": 99.90625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.31250721792354774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.006396896929800278, + "learning_rate": 2.5899999999999998e-06, + "loss": 0.0003, + "num_tokens": 62906377.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 149.90625, + "completions/mean_terminated_length": 149.90625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.31262270470031184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.008559439324017148, + "learning_rate": 2.588e-06, + "loss": 0.0003, + "num_tokens": 62935814.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 133.0625, + "completions/mean_terminated_length": 133.0625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.3127381914770759, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.011036360447178595, + "learning_rate": 2.586e-06, + "loss": 0.0004, + "num_tokens": 62955976.0, + "reward": 3.9017152786254883, + "reward_std": 0.2652026116847992, + "rewards/reward_fn/mean": 3.9017152786254883, + "rewards/reward_fn/std": 0.2652025520801544, + "step": 2708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 123.03125, + "completions/mean_terminated_length": 123.03125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.3128536782538399, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "kl": 0.015237348961818498, + "learning_rate": 2.584e-06, + "loss": 0.0006, + "num_tokens": 62973609.0, + "reward": 3.933962345123291, + "reward_std": 0.3735656440258026, + "rewards/reward_fn/mean": 3.933962345123291, + "rewards/reward_fn/std": 0.3735656440258026, + "step": 2709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 82.3125, + "completions/mean_terminated_length": 82.3125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.312969165030604, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.875, + "kl": 0.0066992974061577115, + "learning_rate": 2.5819999999999996e-06, + "loss": 0.0003, + "num_tokens": 62984115.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 2710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 245.6875, + "completions/mean_terminated_length": 245.6875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.31308465180736805, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.01170676363108214, + "learning_rate": 2.58e-06, + "loss": 0.0005, + "num_tokens": 63001961.0, + "reward": 3.3352606296539307, + "reward_std": 0.42461028695106506, + "rewards/reward_fn/mean": 3.3352606296539307, + "rewards/reward_fn/std": 0.4246101975440979, + "step": 2711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 189.0625, + "completions/mean_terminated_length": 189.0625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.3132001385841321, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.012482154153985903, + "learning_rate": 2.5779999999999997e-06, + "loss": 0.0005, + "num_tokens": 63031435.0, + "reward": 3.8459088802337646, + "reward_std": 0.5382976531982422, + "rewards/reward_fn/mean": 3.8459088802337646, + "rewards/reward_fn/std": 0.5382976531982422, + "step": 2712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 240.9375, + "completions/mean_terminated_length": 240.9375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.3133156253608962, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.0174019607657101, + "learning_rate": 2.576e-06, + "loss": 0.0007, + "num_tokens": 63057865.0, + "reward": 3.7589075565338135, + "reward_std": 0.5530949831008911, + "rewards/reward_fn/mean": 3.7589075565338135, + "rewards/reward_fn/std": 0.5530950427055359, + "step": 2713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 117.03125, + "completions/mean_terminated_length": 117.03125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.31343111213766023, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.009930548127158545, + "learning_rate": 2.574e-06, + "loss": 0.0004, + "num_tokens": 63077450.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 132.78125, + "completions/mean_terminated_length": 132.78125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.3135465989144243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.008102630876237527, + "learning_rate": 2.572e-06, + "loss": 0.0003, + "num_tokens": 63095523.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 196.4375, + "completions/mean_terminated_length": 196.4375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.31366208569118836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.009221018510288559, + "learning_rate": 2.5699999999999995e-06, + "loss": 0.0004, + "num_tokens": 63122993.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 142.75, + "completions/mean_terminated_length": 142.75, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.3137775724679524, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.011102884338470176, + "learning_rate": 2.568e-06, + "loss": 0.0004, + "num_tokens": 63137353.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 191.4375, + "completions/mean_terminated_length": 191.4375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.3138930592447165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037841796875, + "kl": 0.007620092568686232, + "learning_rate": 2.5659999999999997e-06, + "loss": 0.0003, + "num_tokens": 63155671.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 205.46875, + "completions/mean_terminated_length": 205.46875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.31400854602148054, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.010923708301561419, + "learning_rate": 2.564e-06, + "loss": 0.0004, + "num_tokens": 63180518.0, + "reward": 3.864438056945801, + "reward_std": 0.32044923305511475, + "rewards/reward_fn/mean": 3.864438056945801, + "rewards/reward_fn/std": 0.32044920325279236, + "step": 2719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 88.6875, + "completions/mean_terminated_length": 88.6875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.3141240327982446, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "kl": 0.014698137456434779, + "learning_rate": 2.5619999999999998e-06, + "loss": 0.0006, + "num_tokens": 63204028.0, + "reward": 3.4629855155944824, + "reward_std": 0.2874915599822998, + "rewards/reward_fn/mean": 3.4629855155944824, + "rewards/reward_fn/std": 0.2874916195869446, + "step": 2720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 214.53125, + "completions/mean_terminated_length": 214.53125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.3142395195750087, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.010357199018471874, + "learning_rate": 2.56e-06, + "loss": 0.0004, + "num_tokens": 63232781.0, + "reward": 3.259221076965332, + "reward_std": 0.40647897124290466, + "rewards/reward_fn/mean": 3.259221076965332, + "rewards/reward_fn/std": 0.40647897124290466, + "step": 2721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 227.3125, + "completions/mean_terminated_length": 227.3125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.3143550063517727, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.01384608844819013, + "learning_rate": 2.558e-06, + "loss": 0.0006, + "num_tokens": 63259863.0, + "reward": 3.9119999408721924, + "reward_std": 0.2783736288547516, + "rewards/reward_fn/mean": 3.9119999408721924, + "rewards/reward_fn/std": 0.2783735990524292, + "step": 2722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 90.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 66.25, + "completions/mean_terminated_length": 66.25, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.3144704931285368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.314453125, + "kl": 0.01695368450600654, + "learning_rate": 2.5559999999999998e-06, + "loss": 0.0007, + "num_tokens": 63277567.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 301.8125, + "completions/mean_terminated_length": 301.8125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.31458597990530085, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.012405559828039259, + "learning_rate": 2.5539999999999996e-06, + "loss": 0.0005, + "num_tokens": 63298745.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 2724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 139.21875, + "completions/mean_terminated_length": 139.21875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3147014666820649, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.009328622698376421, + "learning_rate": 2.552e-06, + "loss": 0.0004, + "num_tokens": 63315520.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 232.8125, + "completions/mean_terminated_length": 232.8125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.314816953458829, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.014673634796054102, + "learning_rate": 2.5499999999999997e-06, + "loss": 0.0006, + "num_tokens": 63346010.0, + "reward": 3.6623330116271973, + "reward_std": 0.9467941522598267, + "rewards/reward_fn/mean": 3.6623330116271973, + "rewards/reward_fn/std": 0.9467940926551819, + "step": 2726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 128.375, + "completions/mean_terminated_length": 128.375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.314932440235593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.0077944597287569195, + "learning_rate": 2.548e-06, + "loss": 0.0003, + "num_tokens": 63362534.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 79.375, + "completions/mean_terminated_length": 79.375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.31504792701235707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.006189575400640024, + "learning_rate": 2.546e-06, + "loss": 0.0002, + "num_tokens": 63377106.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 79.96875, + "completions/mean_terminated_length": 79.96875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.31516341378912116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.00667865906143561, + "learning_rate": 2.544e-06, + "loss": 0.0003, + "num_tokens": 63396977.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 75.9375, + "completions/mean_terminated_length": 75.9375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.3152789005658852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.0072476437635486946, + "learning_rate": 2.5419999999999995e-06, + "loss": 0.0003, + "num_tokens": 63411375.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 169.53125, + "completions/mean_terminated_length": 169.53125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.31539438734264924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0478515625, + "kl": 0.007552181104983902, + "learning_rate": 2.54e-06, + "loss": 0.0003, + "num_tokens": 63439840.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 153.0625, + "completions/mean_terminated_length": 153.0625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.31550987411941334, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.009288132016081363, + "learning_rate": 2.5379999999999997e-06, + "loss": 0.0004, + "num_tokens": 63458146.0, + "reward": 3.9324188232421875, + "reward_std": 0.3822965919971466, + "rewards/reward_fn/mean": 3.9324188232421875, + "rewards/reward_fn/std": 0.3822965919971466, + "step": 2732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 217.9375, + "completions/mean_terminated_length": 217.9375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.3156253608961774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1123046875, + "kl": 0.012366659655526746, + "learning_rate": 2.536e-06, + "loss": 0.0005, + "num_tokens": 63490752.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1012.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 465.71875, + "completions/mean_terminated_length": 465.71875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.31574084767294147, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.00874054200539831, + "learning_rate": 2.534e-06, + "loss": 0.0003, + "num_tokens": 63525175.0, + "reward": 3.327127695083618, + "reward_std": 0.3839357793331146, + "rewards/reward_fn/mean": 3.327127695083618, + "rewards/reward_fn/std": 0.38393574953079224, + "step": 2734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 121.40625, + "completions/mean_terminated_length": 121.40625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.3158563344497055, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.01678837521467358, + "learning_rate": 2.532e-06, + "loss": 0.0007, + "num_tokens": 63548708.0, + "reward": 3.9299473762512207, + "reward_std": 0.39627763628959656, + "rewards/reward_fn/mean": 3.9299473762512207, + "rewards/reward_fn/std": 0.39627763628959656, + "step": 2735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 434.21875, + "completions/mean_terminated_length": 434.21875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.31597182122646955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.009420979229616933, + "learning_rate": 2.5299999999999995e-06, + "loss": 0.0004, + "num_tokens": 63572427.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 163.9375, + "completions/mean_terminated_length": 163.9375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.31608730800323365, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.010215885573416017, + "learning_rate": 2.5279999999999998e-06, + "loss": 0.0004, + "num_tokens": 63585673.0, + "reward": 3.961101770401001, + "reward_std": 0.2200412154197693, + "rewards/reward_fn/mean": 3.961101770401001, + "rewards/reward_fn/std": 0.2200412005186081, + "step": 2737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 350.125, + "completions/mean_terminated_length": 350.125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.3162027947799977, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.010093496675835922, + "learning_rate": 2.5259999999999996e-06, + "loss": 0.0004, + "num_tokens": 63608973.0, + "reward": 3.9322774410247803, + "reward_std": 0.38309669494628906, + "rewards/reward_fn/mean": 3.9322774410247803, + "rewards/reward_fn/std": 0.3830966651439667, + "step": 2738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 246.6875, + "completions/mean_terminated_length": 246.6875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.3163182815567617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.008793192937446292, + "learning_rate": 2.524e-06, + "loss": 0.0004, + "num_tokens": 63632323.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 86.4375, + "completions/mean_terminated_length": 86.4375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.3164337683335258, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.125, + "kl": 0.013341444864636287, + "learning_rate": 2.5219999999999997e-06, + "loss": 0.0005, + "num_tokens": 63656369.0, + "reward": 3.233304023742676, + "reward_std": 0.04133351892232895, + "rewards/reward_fn/mean": 3.233304023742676, + "rewards/reward_fn/std": 0.04133349284529686, + "step": 2740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 185.90625, + "completions/mean_terminated_length": 185.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.31654925511028986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0478515625, + "kl": 0.006013278602040373, + "learning_rate": 2.52e-06, + "loss": 0.0002, + "num_tokens": 63686062.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 164.71875, + "completions/mean_terminated_length": 164.71875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.31666474188705396, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.009512087672192138, + "learning_rate": 2.518e-06, + "loss": 0.0004, + "num_tokens": 63707525.0, + "reward": 3.969661235809326, + "reward_std": 0.17162194848060608, + "rewards/reward_fn/mean": 3.969661235809326, + "rewards/reward_fn/std": 0.1716219186782837, + "step": 2742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 146.5, + "completions/mean_terminated_length": 146.5, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.316780228663818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.013452917206450365, + "learning_rate": 2.5159999999999997e-06, + "loss": 0.0005, + "num_tokens": 63734933.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 153.21875, + "completions/mean_terminated_length": 153.21875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.31689571544058204, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.010403384163510054, + "learning_rate": 2.5139999999999996e-06, + "loss": 0.0004, + "num_tokens": 63763324.0, + "reward": 3.960024118423462, + "reward_std": 0.1724734753370285, + "rewards/reward_fn/mean": 3.960024118423462, + "rewards/reward_fn/std": 0.1724734753370285, + "step": 2744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 147.53125, + "completions/mean_terminated_length": 147.53125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.31701120221734613, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.84375, + "kl": 0.011690674960846081, + "learning_rate": 2.512e-06, + "loss": 0.0005, + "num_tokens": 63793165.0, + "reward": 3.6536521911621094, + "reward_std": 0.8177436590194702, + "rewards/reward_fn/mean": 3.6536521911621094, + "rewards/reward_fn/std": 0.8177435994148254, + "step": 2745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 113.46875, + "completions/mean_terminated_length": 113.46875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.3171266889941102, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.011311905167531222, + "learning_rate": 2.5099999999999997e-06, + "loss": 0.0005, + "num_tokens": 63809468.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 312.1875, + "completions/mean_terminated_length": 312.1875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.3172421757708742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.009144694515271112, + "learning_rate": 2.508e-06, + "loss": 0.0004, + "num_tokens": 63829090.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 197.59375, + "completions/mean_terminated_length": 197.59375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.3173576625476383, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.00980408862233162, + "learning_rate": 2.506e-06, + "loss": 0.0004, + "num_tokens": 63857845.0, + "reward": 3.4490761756896973, + "reward_std": 0.532264769077301, + "rewards/reward_fn/mean": 3.4490761756896973, + "rewards/reward_fn/std": 0.5322647094726562, + "step": 2748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 799.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 240.59375, + "completions/mean_terminated_length": 240.59375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.31747314932440235, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.006936740974197164, + "learning_rate": 2.504e-06, + "loss": 0.0003, + "num_tokens": 63883848.0, + "reward": 3.861337900161743, + "reward_std": 0.5456666946411133, + "rewards/reward_fn/mean": 3.861337900161743, + "rewards/reward_fn/std": 0.5456666946411133, + "step": 2749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 110.8125, + "completions/mean_terminated_length": 110.8125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.31758863610116644, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.012125811357691418, + "learning_rate": 2.5019999999999995e-06, + "loss": 0.0005, + "num_tokens": 63903490.0, + "reward": 3.9640579223632812, + "reward_std": 0.2033190280199051, + "rewards/reward_fn/mean": 3.9640579223632812, + "rewards/reward_fn/std": 0.2033190131187439, + "step": 2750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 329.1875, + "completions/mean_terminated_length": 329.1875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.3177041228779305, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.016102849505841732, + "learning_rate": 2.4999999999999998e-06, + "loss": 0.0006, + "num_tokens": 63933032.0, + "reward": 3.4731035232543945, + "reward_std": 0.7452911138534546, + "rewards/reward_fn/mean": 3.4731035232543945, + "rewards/reward_fn/std": 0.7452911734580994, + "step": 2751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 413.875, + "completions/mean_terminated_length": 413.875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.3178196096546945, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.012367506511509418, + "learning_rate": 2.498e-06, + "loss": 0.0005, + "num_tokens": 63957220.0, + "reward": 3.7245872020721436, + "reward_std": 0.7403780817985535, + "rewards/reward_fn/mean": 3.7245872020721436, + "rewards/reward_fn/std": 0.7403780221939087, + "step": 2752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 260.15625, + "completions/mean_terminated_length": 260.15625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.3179350964314586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.012594248008099385, + "learning_rate": 2.496e-06, + "loss": 0.0005, + "num_tokens": 63986569.0, + "reward": 3.361686944961548, + "reward_std": 0.4659510552883148, + "rewards/reward_fn/mean": 3.361686944961548, + "rewards/reward_fn/std": 0.4659511148929596, + "step": 2753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 81.90625, + "completions/mean_terminated_length": 81.90625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.31805058320822266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11962890625, + "kl": 0.008066423109994503, + "learning_rate": 2.494e-06, + "loss": 0.0003, + "num_tokens": 64010598.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 115.3125, + "completions/mean_terminated_length": 115.3125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.3181660699849867, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.008611628429207485, + "learning_rate": 2.492e-06, + "loss": 0.0003, + "num_tokens": 64027952.0, + "reward": 3.969980001449585, + "reward_std": 0.1698189228773117, + "rewards/reward_fn/mean": 3.969980001449585, + "rewards/reward_fn/std": 0.1698189675807953, + "step": 2755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 72.6875, + "completions/mean_terminated_length": 72.6875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.3182815567617508, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.006834873449406587, + "learning_rate": 2.4900000000000003e-06, + "loss": 0.0003, + "num_tokens": 64039558.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 220.8125, + "completions/mean_terminated_length": 220.8125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.31839704353851483, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057861328125, + "kl": 0.01050351435696939, + "learning_rate": 2.4879999999999997e-06, + "loss": 0.0004, + "num_tokens": 64062528.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 69.03125, + "completions/mean_terminated_length": 69.03125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.3185125303152789, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.006101208009567927, + "learning_rate": 2.486e-06, + "loss": 0.0002, + "num_tokens": 64075425.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 130.5, + "completions/mean_terminated_length": 130.5, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.31862801709204297, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.012136861885664985, + "learning_rate": 2.484e-06, + "loss": 0.0005, + "num_tokens": 64102993.0, + "reward": 3.978485345840454, + "reward_std": 0.12170526385307312, + "rewards/reward_fn/mean": 3.978485345840454, + "rewards/reward_fn/std": 0.12170525640249252, + "step": 2759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 150.625, + "completions/mean_terminated_length": 150.625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.318743503868807, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.011136336353956722, + "learning_rate": 2.482e-06, + "loss": 0.0004, + "num_tokens": 64130437.0, + "reward": 3.899794101715088, + "reward_std": 0.41951990127563477, + "rewards/reward_fn/mean": 3.899794101715088, + "rewards/reward_fn/std": 0.4195198714733124, + "step": 2760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 183.96875, + "completions/mean_terminated_length": 183.96875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3188589906455711, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.009000279744213913, + "learning_rate": 2.48e-06, + "loss": 0.0004, + "num_tokens": 64153252.0, + "reward": 3.4745144844055176, + "reward_std": 0.5517653226852417, + "rewards/reward_fn/mean": 3.4745144844055176, + "rewards/reward_fn/std": 0.5517652630805969, + "step": 2761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 320.28125, + "completions/mean_terminated_length": 320.28125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.31897447742233515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04052734375, + "kl": 0.00822222011629492, + "learning_rate": 2.4780000000000002e-06, + "loss": 0.0003, + "num_tokens": 64173133.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 215.03125, + "completions/mean_terminated_length": 215.03125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.3190899641990992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04833984375, + "kl": 0.010119085811311379, + "learning_rate": 2.4759999999999997e-06, + "loss": 0.0004, + "num_tokens": 64192366.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 361.34375, + "completions/mean_terminated_length": 361.34375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.3192054509758633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056396484375, + "kl": 0.01033972312870901, + "learning_rate": 2.474e-06, + "loss": 0.0004, + "num_tokens": 64214393.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 234.96875, + "completions/mean_terminated_length": 234.96875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.3193209377526273, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.01108829420991242, + "learning_rate": 2.472e-06, + "loss": 0.0004, + "num_tokens": 64242680.0, + "reward": 3.933398723602295, + "reward_std": 0.37675321102142334, + "rewards/reward_fn/mean": 3.933398723602295, + "rewards/reward_fn/std": 0.3767532408237457, + "step": 2765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 86.15625, + "completions/mean_terminated_length": 86.15625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.31943642452939136, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.008674213117046747, + "learning_rate": 2.47e-06, + "loss": 0.0003, + "num_tokens": 64255005.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 83.375, + "completions/mean_terminated_length": 83.375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.31955191130615546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11669921875, + "kl": 0.010535722514759982, + "learning_rate": 2.468e-06, + "loss": 0.0004, + "num_tokens": 64273769.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 82.6875, + "completions/mean_terminated_length": 82.6875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.3196673980829195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1494140625, + "kl": 0.009927581006195396, + "learning_rate": 2.466e-06, + "loss": 0.0004, + "num_tokens": 64298175.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 350.53125, + "completions/mean_terminated_length": 350.53125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.3197828848596836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.011916253788513131, + "learning_rate": 2.464e-06, + "loss": 0.0005, + "num_tokens": 64321424.0, + "reward": 3.9308063983917236, + "reward_std": 0.3914179503917694, + "rewards/reward_fn/mean": 3.9308063983917236, + "rewards/reward_fn/std": 0.39141789078712463, + "step": 2769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 366.6875, + "completions/mean_terminated_length": 366.6875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.31989837163644763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.012068418713170104, + "learning_rate": 2.462e-06, + "loss": 0.0005, + "num_tokens": 64346342.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 193.40625, + "completions/mean_terminated_length": 193.40625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.32001385841321167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.008573691637138836, + "learning_rate": 2.4599999999999997e-06, + "loss": 0.0003, + "num_tokens": 64364435.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 78.40625, + "completions/mean_terminated_length": 78.40625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.32012934518997577, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419921875, + "kl": 0.005067975349447806, + "learning_rate": 2.458e-06, + "loss": 0.0002, + "num_tokens": 64382944.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 82.96875, + "completions/mean_terminated_length": 82.96875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.3202448319667398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.014801958197494969, + "learning_rate": 2.456e-06, + "loss": 0.0006, + "num_tokens": 64395967.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 260.8125, + "completions/mean_terminated_length": 260.8125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.32036031874350385, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.00925596045999555, + "learning_rate": 2.454e-06, + "loss": 0.0004, + "num_tokens": 64419641.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 223.8125, + "completions/mean_terminated_length": 223.8125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.32047580552026794, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.010242340329568833, + "learning_rate": 2.452e-06, + "loss": 0.0004, + "num_tokens": 64445107.0, + "reward": 3.9762930870056152, + "reward_std": 0.1341068148612976, + "rewards/reward_fn/mean": 3.9762930870056152, + "rewards/reward_fn/std": 0.1341067999601364, + "step": 2775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 282.09375, + "completions/mean_terminated_length": 282.09375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.320591292297032, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.007813724980223924, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0003, + "num_tokens": 64469750.0, + "reward": 3.856760025024414, + "reward_std": 0.5636440515518188, + "rewards/reward_fn/mean": 3.856760025024414, + "rewards/reward_fn/std": 0.5636440515518188, + "step": 2776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 182.15625, + "completions/mean_terminated_length": 182.15625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.3207067790737961, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.009062898127012886, + "learning_rate": 2.4479999999999997e-06, + "loss": 0.0004, + "num_tokens": 64499419.0, + "reward": 3.9760313034057617, + "reward_std": 0.13558778166770935, + "rewards/reward_fn/mean": 3.9760313034057617, + "rewards/reward_fn/std": 0.13558779656887054, + "step": 2777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 338.5, + "completions/mean_terminated_length": 338.5, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.3208222658505601, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.010350967888371088, + "learning_rate": 2.446e-06, + "loss": 0.0004, + "num_tokens": 64522955.0, + "reward": 3.9312455654144287, + "reward_std": 0.3889341950416565, + "rewards/reward_fn/mean": 3.9312455654144287, + "rewards/reward_fn/std": 0.38893425464630127, + "step": 2778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 66.25, + "completions/mean_terminated_length": 66.25, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.32093775262732416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.0076563617258216254, + "learning_rate": 2.444e-06, + "loss": 0.0003, + "num_tokens": 64534547.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 173.65625, + "completions/mean_terminated_length": 173.65625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.32105323940408825, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057861328125, + "kl": 0.011560576356714591, + "learning_rate": 2.442e-06, + "loss": 0.0005, + "num_tokens": 64561896.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 144.3125, + "completions/mean_terminated_length": 144.3125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.3211687261808523, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.006095185519370716, + "learning_rate": 2.44e-06, + "loss": 0.0002, + "num_tokens": 64581650.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 2781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 93.71875, + "completions/mean_terminated_length": 93.71875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.32128421295761633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.01221131759666605, + "learning_rate": 2.438e-06, + "loss": 0.0005, + "num_tokens": 64611145.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 208.75, + "completions/mean_terminated_length": 208.75, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.32139969973438043, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.013081537385005504, + "learning_rate": 2.4359999999999996e-06, + "loss": 0.0005, + "num_tokens": 64640033.0, + "reward": 3.2655138969421387, + "reward_std": 0.19561885297298431, + "rewards/reward_fn/mean": 3.2655138969421387, + "rewards/reward_fn/std": 0.1956188827753067, + "step": 2783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 136.8125, + "completions/mean_terminated_length": 136.8125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.32151518651114447, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.011057762749260291, + "learning_rate": 2.434e-06, + "loss": 0.0004, + "num_tokens": 64662683.0, + "reward": 3.421008586883545, + "reward_std": 0.11005322635173798, + "rewards/reward_fn/mean": 3.421008586883545, + "rewards/reward_fn/std": 0.11005321890115738, + "step": 2784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 219.78125, + "completions/mean_terminated_length": 219.78125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.3216306732879085, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.01104555153142428, + "learning_rate": 2.4319999999999998e-06, + "loss": 0.0004, + "num_tokens": 64681204.0, + "reward": 3.9277687072753906, + "reward_std": 0.4086018204689026, + "rewards/reward_fn/mean": 3.9277687072753906, + "rewards/reward_fn/std": 0.4086018204689026, + "step": 2785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 214.84375, + "completions/mean_terminated_length": 214.84375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.3217461600646726, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.014686491995234974, + "learning_rate": 2.43e-06, + "loss": 0.0006, + "num_tokens": 64701295.0, + "reward": 3.216512441635132, + "reward_std": 0.2746817469596863, + "rewards/reward_fn/mean": 3.216512441635132, + "rewards/reward_fn/std": 0.27468177676200867, + "step": 2786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 141.0625, + "completions/mean_terminated_length": 141.0625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.32186164684143664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.008947720685682725, + "learning_rate": 2.428e-06, + "loss": 0.0004, + "num_tokens": 64728945.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 211.9375, + "completions/mean_terminated_length": 211.9375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.32197713361820074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.014358258064021356, + "learning_rate": 2.426e-06, + "loss": 0.0006, + "num_tokens": 64752431.0, + "reward": 3.327171802520752, + "reward_std": 0.2007744461297989, + "rewards/reward_fn/mean": 3.327171802520752, + "rewards/reward_fn/std": 0.20077446103096008, + "step": 2788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1488.0, + "completions/max_terminated_length": 1488.0, + "completions/mean_length": 392.75, + "completions/mean_terminated_length": 392.75, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.3220926203949648, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.011425064309150912, + "learning_rate": 2.424e-06, + "loss": 0.0005, + "num_tokens": 64777383.0, + "reward": 3.525834083557129, + "reward_std": 0.7761809825897217, + "rewards/reward_fn/mean": 3.525834083557129, + "rewards/reward_fn/std": 0.7761809229850769, + "step": 2789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 82.15625, + "completions/mean_terminated_length": 82.15625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.3222081071717288, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.006457480289100204, + "learning_rate": 2.422e-06, + "loss": 0.0003, + "num_tokens": 64787788.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 144.6875, + "completions/mean_terminated_length": 144.6875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.3223235939484929, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.01859411332407035, + "learning_rate": 2.4199999999999997e-06, + "loss": 0.0007, + "num_tokens": 64814498.0, + "reward": 3.988279104232788, + "reward_std": 0.06630308926105499, + "rewards/reward_fn/mean": 3.988279104232788, + "rewards/reward_fn/std": 0.06630310416221619, + "step": 2791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 119.1875, + "completions/mean_terminated_length": 119.1875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.32243908072525695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.005988231445371639, + "learning_rate": 2.418e-06, + "loss": 0.0002, + "num_tokens": 64839752.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 241.40625, + "completions/mean_terminated_length": 241.40625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.322554567502021, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.008087204492767341, + "learning_rate": 2.416e-06, + "loss": 0.0003, + "num_tokens": 64861973.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 267.96875, + "completions/mean_terminated_length": 267.96875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.3226700542787851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04052734375, + "kl": 0.007648636928934138, + "learning_rate": 2.414e-06, + "loss": 0.0003, + "num_tokens": 64885108.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 220.625, + "completions/mean_terminated_length": 220.625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.32278554105554913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.009247736838005949, + "learning_rate": 2.412e-06, + "loss": 0.0004, + "num_tokens": 64908648.0, + "reward": 3.8895552158355713, + "reward_std": 0.30178773403167725, + "rewards/reward_fn/mean": 3.8895552158355713, + "rewards/reward_fn/std": 0.30178770422935486, + "step": 2795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 283.25, + "completions/mean_terminated_length": 283.25, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.3229010278323132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03466796875, + "kl": 0.007093796404660679, + "learning_rate": 2.4100000000000002e-06, + "loss": 0.0003, + "num_tokens": 64927728.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 83.34375, + "completions/mean_terminated_length": 83.34375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.32301651460907727, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.01177445323264692, + "learning_rate": 2.4079999999999996e-06, + "loss": 0.0005, + "num_tokens": 64948539.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 97.34375, + "completions/mean_terminated_length": 97.34375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.3231320013858413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.007266304823133396, + "learning_rate": 2.406e-06, + "loss": 0.0003, + "num_tokens": 64972326.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 347.0, + "completions/mean_terminated_length": 347.0, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.3232474881626054, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.011966906153247692, + "learning_rate": 2.4039999999999998e-06, + "loss": 0.0005, + "num_tokens": 64995910.0, + "reward": 3.705845355987549, + "reward_std": 0.5952614545822144, + "rewards/reward_fn/mean": 3.705845355987549, + "rewards/reward_fn/std": 0.5952614545822144, + "step": 2799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 122.9375, + "completions/mean_terminated_length": 122.9375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.32336297493936944, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.01081373325723689, + "learning_rate": 2.402e-06, + "loss": 0.0004, + "num_tokens": 65018404.0, + "reward": 3.8728585243225098, + "reward_std": 0.3030933439731598, + "rewards/reward_fn/mean": 3.8728585243225098, + "rewards/reward_fn/std": 0.3030933141708374, + "step": 2800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 117.8125, + "completions/mean_terminated_length": 117.8125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.3234784617161335, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.00945870392388315, + "learning_rate": 2.4e-06, + "loss": 0.0004, + "num_tokens": 65049438.0, + "reward": 3.5684874057769775, + "reward_std": 0.3813121020793915, + "rewards/reward_fn/mean": 3.5684874057769775, + "rewards/reward_fn/std": 0.3813120424747467, + "step": 2801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 144.5, + "completions/mean_terminated_length": 144.5, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.3235939484928976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.008285518430056982, + "learning_rate": 2.398e-06, + "loss": 0.0003, + "num_tokens": 65080942.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 88.65625, + "completions/mean_terminated_length": 88.65625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.3237094352696616, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.007276996613654774, + "learning_rate": 2.3959999999999996e-06, + "loss": 0.0003, + "num_tokens": 65109667.0, + "reward": 3.669325113296509, + "reward_std": 0.024551646783947945, + "rewards/reward_fn/mean": 3.669325113296509, + "rewards/reward_fn/std": 0.024551618844270706, + "step": 2803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 53.09375, + "completions/mean_terminated_length": 53.09375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.3238249220464257, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.71875, + "kl": 0.018640238791704178, + "learning_rate": 2.394e-06, + "loss": 0.0007, + "num_tokens": 65123590.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 2804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 100.0, + "completions/max_terminated_length": 100.0, + "completions/mean_length": 71.4375, + "completions/mean_terminated_length": 71.4375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.32394040882318975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2060546875, + "kl": 0.008494602394421236, + "learning_rate": 2.3919999999999997e-06, + "loss": 0.0003, + "num_tokens": 65154356.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 392.6875, + "completions/mean_terminated_length": 392.6875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.3240558955999538, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.011420139970141463, + "learning_rate": 2.39e-06, + "loss": 0.0005, + "num_tokens": 65191978.0, + "reward": 3.8906590938568115, + "reward_std": 0.29783523082733154, + "rewards/reward_fn/mean": 3.8906590938568115, + "rewards/reward_fn/std": 0.29783523082733154, + "step": 2806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 94.5, + "completions/mean_terminated_length": 94.5, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.3241713823767179, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.007486872505978681, + "learning_rate": 2.388e-06, + "loss": 0.0003, + "num_tokens": 65215802.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 342.15625, + "completions/mean_terminated_length": 342.15625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.3242868691534819, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.013547555092372932, + "learning_rate": 2.386e-06, + "loss": 0.0005, + "num_tokens": 65242687.0, + "reward": 3.930849552154541, + "reward_std": 0.39117431640625, + "rewards/reward_fn/mean": 3.930849552154541, + "rewards/reward_fn/std": 0.39117431640625, + "step": 2808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 191.625, + "completions/mean_terminated_length": 191.625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.32440235593024597, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.019061736165895127, + "learning_rate": 2.384e-06, + "loss": 0.0008, + "num_tokens": 65274131.0, + "reward": 3.561479091644287, + "reward_std": 0.4827370047569275, + "rewards/reward_fn/mean": 3.561479091644287, + "rewards/reward_fn/std": 0.4827370047569275, + "step": 2809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 56.25, + "completions/mean_terminated_length": 56.25, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.32451784270701006, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.01335116559494054, + "learning_rate": 2.382e-06, + "loss": 0.0005, + "num_tokens": 65293915.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 187.96875, + "completions/mean_terminated_length": 187.96875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.3246333294837741, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.010697991063352674, + "learning_rate": 2.3799999999999997e-06, + "loss": 0.0004, + "num_tokens": 65314906.0, + "reward": 3.680807590484619, + "reward_std": 0.4485202133655548, + "rewards/reward_fn/mean": 3.680807590484619, + "rewards/reward_fn/std": 0.4485201835632324, + "step": 2811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 208.5, + "completions/mean_terminated_length": 208.5, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.32474881626053814, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.027318571956129745, + "learning_rate": 2.378e-06, + "loss": 0.0011, + "num_tokens": 65346538.0, + "reward": 3.825892686843872, + "reward_std": 0.46288758516311646, + "rewards/reward_fn/mean": 3.825892686843872, + "rewards/reward_fn/std": 0.46288758516311646, + "step": 2812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 388.8125, + "completions/mean_terminated_length": 388.8125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.32486430303730224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.010838527174200863, + "learning_rate": 2.3759999999999998e-06, + "loss": 0.0004, + "num_tokens": 65382660.0, + "reward": 3.9356589317321777, + "reward_std": 0.2560058832168579, + "rewards/reward_fn/mean": 3.9356589317321777, + "rewards/reward_fn/std": 0.2560059130191803, + "step": 2813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 150.46875, + "completions/mean_terminated_length": 150.46875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.3249797898140663, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.013076754112262279, + "learning_rate": 2.374e-06, + "loss": 0.0005, + "num_tokens": 65410515.0, + "reward": 3.59260630607605, + "reward_std": 0.2850472629070282, + "rewards/reward_fn/mean": 3.59260630607605, + "rewards/reward_fn/std": 0.2850472629070282, + "step": 2814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 368.15625, + "completions/mean_terminated_length": 368.15625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.3250952765908304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057373046875, + "kl": 0.01336515297589358, + "learning_rate": 2.372e-06, + "loss": 0.0005, + "num_tokens": 65437432.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 77.0625, + "completions/mean_terminated_length": 77.0625, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.3252107633675944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.0059585058897937415, + "learning_rate": 2.37e-06, + "loss": 0.0002, + "num_tokens": 65458714.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 147.40625, + "completions/mean_terminated_length": 147.40625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.32532625014435845, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.01240947734186193, + "learning_rate": 2.3679999999999996e-06, + "loss": 0.0005, + "num_tokens": 65479879.0, + "reward": 3.7942562103271484, + "reward_std": 0.590320348739624, + "rewards/reward_fn/mean": 3.7942562103271484, + "rewards/reward_fn/std": 0.590320348739624, + "step": 2817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 59.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 45.09375, + "completions/mean_terminated_length": 45.09375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.32544173692112255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.00879504744807491, + "learning_rate": 2.366e-06, + "loss": 0.0004, + "num_tokens": 65491402.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 213.625, + "completions/mean_terminated_length": 213.625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.3255572236978866, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.009913572517689317, + "learning_rate": 2.3639999999999997e-06, + "loss": 0.0004, + "num_tokens": 65510366.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 199.25, + "completions/mean_terminated_length": 199.25, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.32567271047465063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.016679503518389538, + "learning_rate": 2.362e-06, + "loss": 0.0007, + "num_tokens": 65529510.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 205.875, + "completions/mean_terminated_length": 205.875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.3257881972514147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.012882231312687509, + "learning_rate": 2.36e-06, + "loss": 0.0005, + "num_tokens": 65548002.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 179.0, + "completions/mean_terminated_length": 179.0, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.32590368402817876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.017213871818967164, + "learning_rate": 2.358e-06, + "loss": 0.0007, + "num_tokens": 65577794.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 62.96875, + "completions/mean_terminated_length": 62.96875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.32601917080494286, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.006528388475999236, + "learning_rate": 2.356e-06, + "loss": 0.0003, + "num_tokens": 65589185.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 112.96875, + "completions/mean_terminated_length": 112.96875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.3261346575817069, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.009047551240655594, + "learning_rate": 2.354e-06, + "loss": 0.0004, + "num_tokens": 65600896.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 93.96875, + "completions/mean_terminated_length": 93.96875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.32625014435847094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.004165706057392526, + "learning_rate": 2.3519999999999997e-06, + "loss": 0.0002, + "num_tokens": 65622175.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 180.125, + "completions/mean_terminated_length": 180.125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.32636563113523503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042724609375, + "kl": 0.006797090296458919, + "learning_rate": 2.35e-06, + "loss": 0.0003, + "num_tokens": 65651619.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 229.8125, + "completions/mean_terminated_length": 229.8125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.3264811179119991, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.013571443894761615, + "learning_rate": 2.348e-06, + "loss": 0.0005, + "num_tokens": 65679869.0, + "reward": 3.875427722930908, + "reward_std": 0.27152708172798157, + "rewards/reward_fn/mean": 3.875427722930908, + "rewards/reward_fn/std": 0.2715270519256592, + "step": 2827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 94.65625, + "completions/mean_terminated_length": 94.65625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.3265966046887631, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.006982167091337033, + "learning_rate": 2.346e-06, + "loss": 0.0003, + "num_tokens": 65695250.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 233.15625, + "completions/mean_terminated_length": 233.15625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.3267120914655272, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.008321078625158407, + "learning_rate": 2.344e-06, + "loss": 0.0003, + "num_tokens": 65720951.0, + "reward": 2.5118589401245117, + "reward_std": 0.9869444966316223, + "rewards/reward_fn/mean": 2.5118589401245117, + "rewards/reward_fn/std": 0.9869444966316223, + "step": 2829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 346.375, + "completions/mean_terminated_length": 346.375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.32682757824229125, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.012400950843584724, + "learning_rate": 2.3419999999999998e-06, + "loss": 0.0005, + "num_tokens": 65752483.0, + "reward": 3.6481423377990723, + "reward_std": 0.5794773697853088, + "rewards/reward_fn/mean": 3.6481423377990723, + "rewards/reward_fn/std": 0.5794773101806641, + "step": 2830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 192.09375, + "completions/mean_terminated_length": 192.09375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.32694306501905535, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.015126500788028352, + "learning_rate": 2.3399999999999996e-06, + "loss": 0.0006, + "num_tokens": 65771270.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 128.625, + "completions/mean_terminated_length": 128.625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.3270585517958194, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.007007556469034171, + "learning_rate": 2.338e-06, + "loss": 0.0003, + "num_tokens": 65799514.0, + "reward": 3.966529607772827, + "reward_std": 0.18933752179145813, + "rewards/reward_fn/mean": 3.966529607772827, + "rewards/reward_fn/std": 0.18933755159378052, + "step": 2832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 88.53125, + "completions/mean_terminated_length": 88.53125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.3271740385725834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.013592646726465318, + "learning_rate": 2.3359999999999997e-06, + "loss": 0.0005, + "num_tokens": 65820459.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 284.4375, + "completions/mean_terminated_length": 284.4375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.3272895253493475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0380859375, + "kl": 0.0074843991387751885, + "learning_rate": 2.334e-06, + "loss": 0.0003, + "num_tokens": 65848793.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 87.75, + "completions/mean_terminated_length": 87.75, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.32740501212611156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.007533333642641082, + "learning_rate": 2.332e-06, + "loss": 0.0003, + "num_tokens": 65863729.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 164.09375, + "completions/mean_terminated_length": 164.09375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.3275204989028756, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.020084393298020586, + "learning_rate": 2.33e-06, + "loss": 0.0008, + "num_tokens": 65892020.0, + "reward": 3.692831516265869, + "reward_std": 0.4669915437698364, + "rewards/reward_fn/mean": 3.692831516265869, + "rewards/reward_fn/std": 0.46699148416519165, + "step": 2836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 224.84375, + "completions/mean_terminated_length": 224.84375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.3276359856796397, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03955078125, + "kl": 0.007623307115864009, + "learning_rate": 2.3279999999999996e-06, + "loss": 0.0003, + "num_tokens": 65908847.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 151.78125, + "completions/mean_terminated_length": 151.78125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.32775147245640374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.006586759598576464, + "learning_rate": 2.326e-06, + "loss": 0.0003, + "num_tokens": 65933288.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 140.03125, + "completions/mean_terminated_length": 140.03125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.3278669592331678, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.00978389555530157, + "learning_rate": 2.3239999999999997e-06, + "loss": 0.0004, + "num_tokens": 65962889.0, + "reward": 3.9858880043029785, + "reward_std": 0.07982943952083588, + "rewards/reward_fn/mean": 3.9858880043029785, + "rewards/reward_fn/std": 0.07982941716909409, + "step": 2839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 291.40625, + "completions/mean_terminated_length": 291.40625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.32798244600993187, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.01000847714021802, + "learning_rate": 2.322e-06, + "loss": 0.0004, + "num_tokens": 65991478.0, + "reward": 3.8452463150024414, + "reward_std": 0.41646265983581543, + "rewards/reward_fn/mean": 3.8452463150024414, + "rewards/reward_fn/std": 0.41646265983581543, + "step": 2840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 285.84375, + "completions/mean_terminated_length": 285.84375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.3280979327866959, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037353515625, + "kl": 0.006465459955506958, + "learning_rate": 2.32e-06, + "loss": 0.0003, + "num_tokens": 66016273.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1381.0, + "completions/max_terminated_length": 1381.0, + "completions/mean_length": 453.40625, + "completions/mean_terminated_length": 453.40625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.32821341956346, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.00988433655584231, + "learning_rate": 2.318e-06, + "loss": 0.0004, + "num_tokens": 66043390.0, + "reward": 3.8014633655548096, + "reward_std": 0.8087443709373474, + "rewards/reward_fn/mean": 3.8014633655548096, + "rewards/reward_fn/std": 0.8087443113327026, + "step": 2842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 82.1875, + "completions/mean_terminated_length": 82.1875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.32832890634022405, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.484375, + "kl": 0.009763631231180625, + "learning_rate": 2.316e-06, + "loss": 0.0004, + "num_tokens": 66064004.0, + "reward": 3.8872244358062744, + "reward_std": 0.3562527298927307, + "rewards/reward_fn/mean": 3.8872244358062744, + "rewards/reward_fn/std": 0.3562527596950531, + "step": 2843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 93.09375, + "completions/mean_terminated_length": 93.09375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.3284443931169881, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.013127586724294815, + "learning_rate": 2.3139999999999998e-06, + "loss": 0.0005, + "num_tokens": 66079591.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 246.71875, + "completions/mean_terminated_length": 246.71875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.3285598798937522, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.00930511012848001, + "learning_rate": 2.3119999999999996e-06, + "loss": 0.0004, + "num_tokens": 66100670.0, + "reward": 3.8637661933898926, + "reward_std": 0.5361736416816711, + "rewards/reward_fn/mean": 3.8637661933898926, + "rewards/reward_fn/std": 0.5361736416816711, + "step": 2845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 146.21875, + "completions/mean_terminated_length": 146.21875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.3286753666705162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.01066348263702821, + "learning_rate": 2.31e-06, + "loss": 0.0004, + "num_tokens": 66117733.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 238.6875, + "completions/mean_terminated_length": 238.6875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.32879085344728026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.014997999300248921, + "learning_rate": 2.3079999999999998e-06, + "loss": 0.0006, + "num_tokens": 66143035.0, + "reward": 3.2666540145874023, + "reward_std": 0.36152881383895874, + "rewards/reward_fn/mean": 3.2666540145874023, + "rewards/reward_fn/std": 0.36152884364128113, + "step": 2847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 178.625, + "completions/mean_terminated_length": 178.625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.32890634022404436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.009349968611786608, + "learning_rate": 2.306e-06, + "loss": 0.0004, + "num_tokens": 66161199.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 134.46875, + "completions/mean_terminated_length": 134.46875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.3290218270008084, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.012123259031795897, + "learning_rate": 2.304e-06, + "loss": 0.0005, + "num_tokens": 66181534.0, + "reward": 3.340648651123047, + "reward_std": 0.29935047030448914, + "rewards/reward_fn/mean": 3.340648651123047, + "rewards/reward_fn/std": 0.29935047030448914, + "step": 2849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 99.03125, + "completions/mean_terminated_length": 99.03125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3291373137775725, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.014583026073523797, + "learning_rate": 2.302e-06, + "loss": 0.0006, + "num_tokens": 66206783.0, + "reward": 3.4328532218933105, + "reward_std": 0.043398115783929825, + "rewards/reward_fn/mean": 3.4328532218933105, + "rewards/reward_fn/std": 0.043398141860961914, + "step": 2850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 203.15625, + "completions/mean_terminated_length": 203.15625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.32925280055433653, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.014088726180489175, + "learning_rate": 2.2999999999999996e-06, + "loss": 0.0006, + "num_tokens": 66241892.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 233.53125, + "completions/mean_terminated_length": 233.53125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.3293682873311006, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.009861567115876824, + "learning_rate": 2.298e-06, + "loss": 0.0004, + "num_tokens": 66278261.0, + "reward": 3.941943645477295, + "reward_std": 0.22873631119728088, + "rewards/reward_fn/mean": 3.941943645477295, + "rewards/reward_fn/std": 0.22873632609844208, + "step": 2852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 228.15625, + "completions/mean_terminated_length": 228.15625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.32948377410786467, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.012979445833479986, + "learning_rate": 2.2959999999999997e-06, + "loss": 0.0005, + "num_tokens": 66299546.0, + "reward": 3.474606990814209, + "reward_std": 0.7004019618034363, + "rewards/reward_fn/mean": 3.474606990814209, + "rewards/reward_fn/std": 0.7004019021987915, + "step": 2853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 184.5, + "completions/mean_terminated_length": 184.5, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.3295992608846287, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.007411323735141195, + "learning_rate": 2.294e-06, + "loss": 0.0003, + "num_tokens": 66322058.0, + "reward": 3.9405856132507324, + "reward_std": 0.23395287990570068, + "rewards/reward_fn/mean": 3.9405856132507324, + "rewards/reward_fn/std": 0.23395287990570068, + "step": 2854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 138.34375, + "completions/mean_terminated_length": 138.34375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.32971474766139275, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.009971756735467352, + "learning_rate": 2.292e-06, + "loss": 0.0004, + "num_tokens": 66339445.0, + "reward": 2.9333293437957764, + "reward_std": 0.026442868635058403, + "rewards/reward_fn/mean": 2.9333293437957764, + "rewards/reward_fn/std": 0.026442868635058403, + "step": 2855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 112.03125, + "completions/mean_terminated_length": 112.03125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.32983023443815684, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.00935578811186133, + "learning_rate": 2.29e-06, + "loss": 0.0004, + "num_tokens": 66356054.0, + "reward": 3.6230297088623047, + "reward_std": 0.11847502738237381, + "rewards/reward_fn/mean": 3.6230297088623047, + "rewards/reward_fn/std": 0.11847499012947083, + "step": 2856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 366.21875, + "completions/mean_terminated_length": 366.21875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.3299457212149209, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.00959819823037833, + "learning_rate": 2.2879999999999995e-06, + "loss": 0.0004, + "num_tokens": 66391901.0, + "reward": 3.8084874153137207, + "reward_std": 0.36878669261932373, + "rewards/reward_fn/mean": 3.8084874153137207, + "rewards/reward_fn/std": 0.36878663301467896, + "step": 2857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 57.6875, + "completions/mean_terminated_length": 57.6875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.330061207991685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.011203691305126995, + "learning_rate": 2.286e-06, + "loss": 0.0004, + "num_tokens": 66409395.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 189.4375, + "completions/mean_terminated_length": 189.4375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.330176694768449, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038330078125, + "kl": 0.007659012786461972, + "learning_rate": 2.2839999999999996e-06, + "loss": 0.0003, + "num_tokens": 66427489.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 363.6875, + "completions/mean_terminated_length": 363.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.33029218154521306, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.00905712389794644, + "learning_rate": 2.282e-06, + "loss": 0.0004, + "num_tokens": 66454871.0, + "reward": 3.928370952606201, + "reward_std": 0.4051942825317383, + "rewards/reward_fn/mean": 3.928370952606201, + "rewards/reward_fn/std": 0.4051942825317383, + "step": 2860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 139.75, + "completions/mean_terminated_length": 139.75, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.33040766832197715, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.012055869927280582, + "learning_rate": 2.2799999999999998e-06, + "loss": 0.0005, + "num_tokens": 66467183.0, + "reward": 3.956573486328125, + "reward_std": 0.1717831939458847, + "rewards/reward_fn/mean": 3.956573486328125, + "rewards/reward_fn/std": 0.17178316414356232, + "step": 2861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 133.0625, + "completions/mean_terminated_length": 133.0625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.3305231550987412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038330078125, + "kl": 0.0036785558932024287, + "learning_rate": 2.278e-06, + "loss": 0.0001, + "num_tokens": 66493745.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 82.65625, + "completions/mean_terminated_length": 82.65625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.33063864187550523, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050048828125, + "kl": 0.0036222520047886064, + "learning_rate": 2.276e-06, + "loss": 0.0001, + "num_tokens": 66520966.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 252.34375, + "completions/mean_terminated_length": 252.34375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.33075412865226933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.01303309948707465, + "learning_rate": 2.2739999999999997e-06, + "loss": 0.0005, + "num_tokens": 66541425.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 260.71875, + "completions/mean_terminated_length": 260.71875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.33086961542903337, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.005888593463168945, + "learning_rate": 2.2719999999999996e-06, + "loss": 0.0002, + "num_tokens": 66565320.0, + "reward": 3.9324071407318115, + "reward_std": 0.38236263394355774, + "rewards/reward_fn/mean": 3.9324071407318115, + "rewards/reward_fn/std": 0.38236263394355774, + "step": 2865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 182.59375, + "completions/mean_terminated_length": 182.59375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.3309851022057974, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.024696928303455934, + "learning_rate": 2.27e-06, + "loss": 0.001, + "num_tokens": 66597243.0, + "reward": 3.861447334289551, + "reward_std": 0.5452405214309692, + "rewards/reward_fn/mean": 3.861447334289551, + "rewards/reward_fn/std": 0.5452405214309692, + "step": 2866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 123.125, + "completions/mean_terminated_length": 123.125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.3311005889825615, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.008583347087551374, + "learning_rate": 2.2679999999999997e-06, + "loss": 0.0003, + "num_tokens": 66626367.0, + "reward": 3.9027693271636963, + "reward_std": 0.4256194233894348, + "rewards/reward_fn/mean": 3.9027693271636963, + "rewards/reward_fn/std": 0.42561936378479004, + "step": 2867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 112.90625, + "completions/mean_terminated_length": 112.90625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.33121607575932555, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.0070166936129680835, + "learning_rate": 2.266e-06, + "loss": 0.0003, + "num_tokens": 66642812.0, + "reward": 3.929549217224121, + "reward_std": 0.3985289931297302, + "rewards/reward_fn/mean": 3.929549217224121, + "rewards/reward_fn/std": 0.398529052734375, + "step": 2868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 130.4375, + "completions/mean_terminated_length": 130.4375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.33133156253608964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.005200020634219982, + "learning_rate": 2.264e-06, + "loss": 0.0002, + "num_tokens": 66669034.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 234.34375, + "completions/mean_terminated_length": 234.34375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.3314470493128537, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.010830000966961961, + "learning_rate": 2.262e-06, + "loss": 0.0004, + "num_tokens": 66692213.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 399.5625, + "completions/mean_terminated_length": 399.5625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.3315625360896177, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.010577195062069222, + "learning_rate": 2.2599999999999995e-06, + "loss": 0.0004, + "num_tokens": 66716999.0, + "reward": 3.4226744174957275, + "reward_std": 1.0161675214767456, + "rewards/reward_fn/mean": 3.4226744174957275, + "rewards/reward_fn/std": 1.0161675214767456, + "step": 2871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 194.40625, + "completions/mean_terminated_length": 194.40625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.3316780228663818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051025390625, + "kl": 0.009538329613860697, + "learning_rate": 2.258e-06, + "loss": 0.0004, + "num_tokens": 66742580.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 102.75, + "completions/mean_terminated_length": 102.75, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.33179350964314586, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "kl": 0.009169280005153269, + "learning_rate": 2.2559999999999997e-06, + "loss": 0.0004, + "num_tokens": 66759340.0, + "reward": 3.414888381958008, + "reward_std": 0.04144783318042755, + "rewards/reward_fn/mean": 3.414888381958008, + "rewards/reward_fn/std": 0.041447851806879044, + "step": 2873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 142.40625, + "completions/mean_terminated_length": 142.40625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.3319089964199099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.006995716565143084, + "learning_rate": 2.254e-06, + "loss": 0.0003, + "num_tokens": 66773561.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 146.1875, + "completions/mean_terminated_length": 146.1875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.332024483196674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05712890625, + "kl": 0.009091046122193802, + "learning_rate": 2.2519999999999998e-06, + "loss": 0.0004, + "num_tokens": 66801951.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 141.0, + "completions/mean_terminated_length": 141.0, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.33213996997343803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.010577811692201067, + "learning_rate": 2.25e-06, + "loss": 0.0004, + "num_tokens": 66816319.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 112.59375, + "completions/mean_terminated_length": 112.59375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.3322554567502021, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054443359375, + "kl": 0.005094734333397355, + "learning_rate": 2.2480000000000003e-06, + "loss": 0.0002, + "num_tokens": 66846610.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 125.78125, + "completions/mean_terminated_length": 125.78125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.33237094352696617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.009334123613371048, + "learning_rate": 2.2459999999999998e-06, + "loss": 0.0004, + "num_tokens": 66871179.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 110.625, + "completions/mean_terminated_length": 110.625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.3324864303037302, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049072265625, + "kl": 0.005027088435781479, + "learning_rate": 2.244e-06, + "loss": 0.0002, + "num_tokens": 66902015.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 132.71875, + "completions/mean_terminated_length": 132.71875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.3326019170804943, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.008497406590322498, + "learning_rate": 2.242e-06, + "loss": 0.0003, + "num_tokens": 66918486.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 209.71875, + "completions/mean_terminated_length": 209.71875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.33271740385725834, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.01886670103704091, + "learning_rate": 2.24e-06, + "loss": 0.0008, + "num_tokens": 66948525.0, + "reward": 3.9791581630706787, + "reward_std": 0.1178991049528122, + "rewards/reward_fn/mean": 3.9791581630706787, + "rewards/reward_fn/std": 0.1178991049528122, + "step": 2881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 127.28125, + "completions/mean_terminated_length": 127.28125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.3328328906340224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.009960984541976359, + "learning_rate": 2.238e-06, + "loss": 0.0004, + "num_tokens": 66979446.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 190.40625, + "completions/mean_terminated_length": 190.40625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.3329483774107865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036376953125, + "kl": 0.006875987695821095, + "learning_rate": 2.2360000000000003e-06, + "loss": 0.0003, + "num_tokens": 66999459.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1113.0, + "completions/max_terminated_length": 1113.0, + "completions/mean_length": 427.65625, + "completions/mean_terminated_length": 427.65625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.3330638641875505, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.008312888006912544, + "learning_rate": 2.2339999999999997e-06, + "loss": 0.0003, + "num_tokens": 67031608.0, + "reward": 2.7185633182525635, + "reward_std": 0.47131162881851196, + "rewards/reward_fn/mean": 2.7185633182525635, + "rewards/reward_fn/std": 0.4713115990161896, + "step": 2884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 239.6875, + "completions/mean_terminated_length": 239.6875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.3331793509643146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.007994112733285874, + "learning_rate": 2.232e-06, + "loss": 0.0003, + "num_tokens": 67054606.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 338.09375, + "completions/mean_terminated_length": 338.09375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.33329483774107865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04638671875, + "kl": 0.009512866963632405, + "learning_rate": 2.23e-06, + "loss": 0.0004, + "num_tokens": 67080401.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 90.1875, + "completions/mean_terminated_length": 90.1875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.3334103245178427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.008693438012414845, + "learning_rate": 2.228e-06, + "loss": 0.0003, + "num_tokens": 67107895.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 966.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 337.8125, + "completions/mean_terminated_length": 337.8125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.3335258112946068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.73828125, + "kl": 0.008162364203599282, + "learning_rate": 2.226e-06, + "loss": 0.0003, + "num_tokens": 67131057.0, + "reward": 3.9299850463867188, + "reward_std": 0.3960638642311096, + "rewards/reward_fn/mean": 3.9299850463867188, + "rewards/reward_fn/std": 0.3960638642311096, + "step": 2888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 186.59375, + "completions/mean_terminated_length": 186.59375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.33364129807137083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045654296875, + "kl": 0.0071841121462057345, + "learning_rate": 2.2240000000000002e-06, + "loss": 0.0003, + "num_tokens": 67151876.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 97.15625, + "completions/mean_terminated_length": 97.15625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.33375678484813487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038818359375, + "kl": 0.0034335435866523767, + "learning_rate": 2.222e-06, + "loss": 0.0001, + "num_tokens": 67175465.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 151.84375, + "completions/mean_terminated_length": 151.84375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.33387227162489896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05126953125, + "kl": 0.007278746234078426, + "learning_rate": 2.22e-06, + "loss": 0.0003, + "num_tokens": 67210756.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 106.1875, + "completions/mean_terminated_length": 106.1875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.333987758401663, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.00617255049655796, + "learning_rate": 2.2179999999999998e-06, + "loss": 0.0002, + "num_tokens": 67234794.0, + "reward": 3.978801965713501, + "reward_std": 0.11991388350725174, + "rewards/reward_fn/mean": 3.978801965713501, + "rewards/reward_fn/std": 0.11991389095783234, + "step": 2892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 144.46875, + "completions/mean_terminated_length": 144.46875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.33410324517842704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.009955407942470629, + "learning_rate": 2.216e-06, + "loss": 0.0004, + "num_tokens": 67261721.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 223.4375, + "completions/mean_terminated_length": 223.4375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.33421873195519114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.007473944984667469, + "learning_rate": 2.214e-06, + "loss": 0.0003, + "num_tokens": 67291239.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 202.34375, + "completions/mean_terminated_length": 202.34375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.3343342187319552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.008437538737780415, + "learning_rate": 2.212e-06, + "loss": 0.0003, + "num_tokens": 67323314.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 110.875, + "completions/mean_terminated_length": 110.875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.3344497055087193, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "kl": 0.012952314857102465, + "learning_rate": 2.21e-06, + "loss": 0.0005, + "num_tokens": 67348206.0, + "reward": 3.7930569648742676, + "reward_std": 0.39757081866264343, + "rewards/reward_fn/mean": 3.7930569648742676, + "rewards/reward_fn/std": 0.39757078886032104, + "step": 2896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 154.9375, + "completions/mean_terminated_length": 154.9375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.3345651922854833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.010843448675586842, + "learning_rate": 2.2080000000000003e-06, + "loss": 0.0004, + "num_tokens": 67366092.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 72.625, + "completions/mean_terminated_length": 72.625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.33468067906224735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.008599572101957165, + "learning_rate": 2.2059999999999997e-06, + "loss": 0.0003, + "num_tokens": 67384192.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 97.875, + "completions/mean_terminated_length": 97.875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.33479616583901145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04443359375, + "kl": 0.004332105128924013, + "learning_rate": 2.204e-06, + "loss": 0.0002, + "num_tokens": 67413884.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 85.5, + "completions/mean_terminated_length": 85.5, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.3349116526157755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150390625, + "kl": 0.00360382729468256, + "learning_rate": 2.202e-06, + "loss": 0.0001, + "num_tokens": 67438220.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1135.0, + "completions/max_terminated_length": 1135.0, + "completions/mean_length": 328.34375, + "completions/mean_terminated_length": 328.34375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.33502713939253953, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051025390625, + "kl": 0.010867657489143312, + "learning_rate": 2.2e-06, + "loss": 0.0004, + "num_tokens": 67461047.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 118.84375, + "completions/mean_terminated_length": 118.84375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.3351426261693036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03369140625, + "kl": 0.003787126365750737, + "learning_rate": 2.198e-06, + "loss": 0.0002, + "num_tokens": 67486418.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 174.40625, + "completions/mean_terminated_length": 174.40625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.33525811294606767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.007571999500214588, + "learning_rate": 2.1960000000000002e-06, + "loss": 0.0003, + "num_tokens": 67504703.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 92.8125, + "completions/mean_terminated_length": 92.8125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.33537359972283176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.005266466974717332, + "learning_rate": 2.1939999999999997e-06, + "loss": 0.0002, + "num_tokens": 67525177.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 92.40625, + "completions/mean_terminated_length": 92.40625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.3354890864995958, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.00714192223676946, + "learning_rate": 2.192e-06, + "loss": 0.0003, + "num_tokens": 67537766.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 162.28125, + "completions/mean_terminated_length": 162.28125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.33560457327635984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.010072743651107885, + "learning_rate": 2.1899999999999998e-06, + "loss": 0.0004, + "num_tokens": 67554575.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 134.78125, + "completions/mean_terminated_length": 134.78125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.33572006005312394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.006249014903005445, + "learning_rate": 2.188e-06, + "loss": 0.0002, + "num_tokens": 67581608.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 342.9375, + "completions/mean_terminated_length": 342.9375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.335835546829888, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.010144096551812254, + "learning_rate": 2.186e-06, + "loss": 0.0004, + "num_tokens": 67604518.0, + "reward": 3.9164624214172363, + "reward_std": 0.2712416350841522, + "rewards/reward_fn/mean": 3.9164624214172363, + "rewards/reward_fn/std": 0.2712416648864746, + "step": 2908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 117.78125, + "completions/mean_terminated_length": 117.78125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.335951033606652, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.008593074868258554, + "learning_rate": 2.184e-06, + "loss": 0.0003, + "num_tokens": 67628799.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 227.96875, + "completions/mean_terminated_length": 227.96875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.3360665203834161, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040283203125, + "kl": 0.00804114859784022, + "learning_rate": 2.182e-06, + "loss": 0.0003, + "num_tokens": 67654814.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 141.6875, + "completions/mean_terminated_length": 141.6875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.33618200716018015, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.014830627034825739, + "learning_rate": 2.18e-06, + "loss": 0.0006, + "num_tokens": 67684148.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 250.65625, + "completions/mean_terminated_length": 250.65625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.33629749393694425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.009352183478767984, + "learning_rate": 2.1779999999999997e-06, + "loss": 0.0004, + "num_tokens": 67707785.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 138.3125, + "completions/mean_terminated_length": 138.3125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.3364129807137083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054443359375, + "kl": 0.011054764749133028, + "learning_rate": 2.176e-06, + "loss": 0.0004, + "num_tokens": 67728339.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 134.84375, + "completions/mean_terminated_length": 134.84375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.3365284674904723, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.007877742711571045, + "learning_rate": 2.174e-06, + "loss": 0.0003, + "num_tokens": 67755630.0, + "reward": 3.3622584342956543, + "reward_std": 0.09921810775995255, + "rewards/reward_fn/mean": 3.3622584342956543, + "rewards/reward_fn/std": 0.09921809285879135, + "step": 2914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 157.09375, + "completions/mean_terminated_length": 157.09375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.3366439542672364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.00925395136437146, + "learning_rate": 2.172e-06, + "loss": 0.0004, + "num_tokens": 67776465.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 146.78125, + "completions/mean_terminated_length": 146.78125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.33675944104400046, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.01649203844135627, + "learning_rate": 2.17e-06, + "loss": 0.0007, + "num_tokens": 67790634.0, + "reward": 3.321286678314209, + "reward_std": 0.3617815375328064, + "rewards/reward_fn/mean": 3.321286678314209, + "rewards/reward_fn/std": 0.361781507730484, + "step": 2916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 220.75, + "completions/mean_terminated_length": 220.75, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.3368749278207645, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.006122719387349207, + "learning_rate": 2.1680000000000002e-06, + "loss": 0.0002, + "num_tokens": 67820770.0, + "reward": 3.3056135177612305, + "reward_std": 0.3437388241291046, + "rewards/reward_fn/mean": 3.3056135177612305, + "rewards/reward_fn/std": 0.343738853931427, + "step": 2917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.3369904145975286, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.019693683745572343, + "learning_rate": 2.1659999999999997e-06, + "loss": 0.0008, + "num_tokens": 67841998.0, + "reward": 3.930920124053955, + "reward_std": 0.3907739818096161, + "rewards/reward_fn/mean": 3.930920124053955, + "rewards/reward_fn/std": 0.3907739818096161, + "step": 2918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 197.46875, + "completions/mean_terminated_length": 197.46875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.33710590137429264, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.015326411885325797, + "learning_rate": 2.164e-06, + "loss": 0.0006, + "num_tokens": 67866461.0, + "reward": 3.8554224967956543, + "reward_std": 0.30882346630096436, + "rewards/reward_fn/mean": 3.8554224967956543, + "rewards/reward_fn/std": 0.30882349610328674, + "step": 2919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 139.28125, + "completions/mean_terminated_length": 139.28125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.3372213881510567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.008324527600052534, + "learning_rate": 2.162e-06, + "loss": 0.0003, + "num_tokens": 67882502.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 79.09375, + "completions/mean_terminated_length": 79.09375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.3373368749278208, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04296875, + "kl": 0.002980079203553032, + "learning_rate": 2.16e-06, + "loss": 0.0001, + "num_tokens": 67900009.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 305.53125, + "completions/mean_terminated_length": 305.53125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.3374523617045848, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.010134892705536913, + "learning_rate": 2.158e-06, + "loss": 0.0004, + "num_tokens": 67924762.0, + "reward": 3.625330924987793, + "reward_std": 0.5510618090629578, + "rewards/reward_fn/mean": 3.625330924987793, + "rewards/reward_fn/std": 0.5510618686676025, + "step": 2922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 99.84375, + "completions/mean_terminated_length": 99.84375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.3375678484813489, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.008214601803047117, + "learning_rate": 2.156e-06, + "loss": 0.0003, + "num_tokens": 67941685.0, + "reward": 3.9675028324127197, + "reward_std": 0.183831587433815, + "rewards/reward_fn/mean": 3.9675028324127197, + "rewards/reward_fn/std": 0.1838315725326538, + "step": 2923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 143.9375, + "completions/mean_terminated_length": 143.9375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.33768333525811295, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "kl": 0.016404667781898752, + "learning_rate": 2.1539999999999996e-06, + "loss": 0.0007, + "num_tokens": 67954387.0, + "reward": 3.5966525077819824, + "reward_std": 0.18543830513954163, + "rewards/reward_fn/mean": 3.5966525077819824, + "rewards/reward_fn/std": 0.18543832004070282, + "step": 2924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 72.03125, + "completions/mean_terminated_length": 72.03125, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.337798822034877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.009407746485521784, + "learning_rate": 2.152e-06, + "loss": 0.0004, + "num_tokens": 67979604.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 219.6875, + "completions/mean_terminated_length": 219.6875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.3379143088116411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.314453125, + "kl": 0.013612426031613722, + "learning_rate": 2.1499999999999997e-06, + "loss": 0.0005, + "num_tokens": 67997706.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 194.625, + "completions/mean_terminated_length": 194.625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.3380297955884051, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.007962761963426601, + "learning_rate": 2.148e-06, + "loss": 0.0003, + "num_tokens": 68025758.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 195.5625, + "completions/mean_terminated_length": 195.5625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.33814528236516916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.02462706173537299, + "learning_rate": 2.146e-06, + "loss": 0.001, + "num_tokens": 68054160.0, + "reward": 3.6877119541168213, + "reward_std": 0.6281372308731079, + "rewards/reward_fn/mean": 3.6877119541168213, + "rewards/reward_fn/std": 0.6281372904777527, + "step": 2928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 278.15625, + "completions/mean_terminated_length": 278.15625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.33826076914193326, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.006007479856634745, + "learning_rate": 2.144e-06, + "loss": 0.0002, + "num_tokens": 68078421.0, + "reward": 3.9275689125061035, + "reward_std": 0.40973228216171265, + "rewards/reward_fn/mean": 3.9275689125061035, + "rewards/reward_fn/std": 0.40973231196403503, + "step": 2929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 97.96875, + "completions/mean_terminated_length": 97.96875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.3383762559186973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1025390625, + "kl": 0.010980077735439409, + "learning_rate": 2.142e-06, + "loss": 0.0004, + "num_tokens": 68106004.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 200.125, + "completions/mean_terminated_length": 200.125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.3384917426954614, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.007746612092887517, + "learning_rate": 2.14e-06, + "loss": 0.0003, + "num_tokens": 68128472.0, + "reward": 3.976165771484375, + "reward_std": 0.13482621312141418, + "rewards/reward_fn/mean": 3.976165771484375, + "rewards/reward_fn/std": 0.13482621312141418, + "step": 2931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 303.15625, + "completions/mean_terminated_length": 303.15625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.33860722947222543, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.01359421898087021, + "learning_rate": 2.1379999999999997e-06, + "loss": 0.0005, + "num_tokens": 68155773.0, + "reward": 2.9703612327575684, + "reward_std": 0.04852040112018585, + "rewards/reward_fn/mean": 2.9703612327575684, + "rewards/reward_fn/std": 0.04852040484547615, + "step": 2932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 137.78125, + "completions/mean_terminated_length": 137.78125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3387227162489895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.011345089929818641, + "learning_rate": 2.136e-06, + "loss": 0.0005, + "num_tokens": 68172470.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 156.84375, + "completions/mean_terminated_length": 156.84375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.33883820302575357, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.00998850931864581, + "learning_rate": 2.134e-06, + "loss": 0.0004, + "num_tokens": 68195281.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 214.96875, + "completions/mean_terminated_length": 214.96875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.3389536898025176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.008019886299734935, + "learning_rate": 2.132e-06, + "loss": 0.0003, + "num_tokens": 68220656.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 126.84375, + "completions/mean_terminated_length": 126.84375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.33906917657928165, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.008633212382846978, + "learning_rate": 2.13e-06, + "loss": 0.0003, + "num_tokens": 68243659.0, + "reward": 3.928901195526123, + "reward_std": 0.4021947681903839, + "rewards/reward_fn/mean": 3.928901195526123, + "rewards/reward_fn/std": 0.40219470858573914, + "step": 2936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 185.4375, + "completions/mean_terminated_length": 185.4375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.33918466335604575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.0062564163890783675, + "learning_rate": 2.128e-06, + "loss": 0.0003, + "num_tokens": 68274681.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 212.625, + "completions/mean_terminated_length": 212.625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.3393001501328098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060791015625, + "kl": 0.011168668701429851, + "learning_rate": 2.1259999999999996e-06, + "loss": 0.0004, + "num_tokens": 68293421.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 203.40625, + "completions/mean_terminated_length": 203.40625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.3394156369095739, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046630859375, + "kl": 0.00828775433183182, + "learning_rate": 2.124e-06, + "loss": 0.0003, + "num_tokens": 68315258.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 281.21875, + "completions/mean_terminated_length": 281.21875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.3395311236863379, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.006879731750814244, + "learning_rate": 2.1219999999999998e-06, + "loss": 0.0003, + "num_tokens": 68340353.0, + "reward": 3.8569042682647705, + "reward_std": 0.5630932450294495, + "rewards/reward_fn/mean": 3.8569042682647705, + "rewards/reward_fn/std": 0.5630931854248047, + "step": 2940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 69.125, + "completions/mean_terminated_length": 69.125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.33964661046310196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.005606758425528824, + "learning_rate": 2.12e-06, + "loss": 0.0002, + "num_tokens": 68359685.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 143.96875, + "completions/mean_terminated_length": 143.96875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.33976209723986606, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.012369232841592748, + "learning_rate": 2.118e-06, + "loss": 0.0005, + "num_tokens": 68376644.0, + "reward": 3.9328956604003906, + "reward_std": 0.2640608251094818, + "rewards/reward_fn/mean": 3.9328956604003906, + "rewards/reward_fn/std": 0.2640608251094818, + "step": 2942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 96.375, + "completions/mean_terminated_length": 96.375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.3398775840166301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.005716824856790481, + "learning_rate": 2.116e-06, + "loss": 0.0002, + "num_tokens": 68405200.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 99.875, + "completions/mean_terminated_length": 99.875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.33999307079339414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04248046875, + "kl": 0.0036233050141163403, + "learning_rate": 2.1139999999999996e-06, + "loss": 0.0001, + "num_tokens": 68428332.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 86.375, + "completions/mean_terminated_length": 86.375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.34010855757015823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.006754925492714392, + "learning_rate": 2.112e-06, + "loss": 0.0003, + "num_tokens": 68443960.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 252.71875, + "completions/mean_terminated_length": 252.71875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.34022404434692227, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.010301897324097808, + "learning_rate": 2.1099999999999997e-06, + "loss": 0.0004, + "num_tokens": 68470671.0, + "reward": 3.9041786193847656, + "reward_std": 0.30713900923728943, + "rewards/reward_fn/mean": 3.9041786193847656, + "rewards/reward_fn/std": 0.30713900923728943, + "step": 2946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 201.28125, + "completions/mean_terminated_length": 201.28125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.3403395311236863, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.01065694222052116, + "learning_rate": 2.108e-06, + "loss": 0.0004, + "num_tokens": 68498296.0, + "reward": 3.9303693771362305, + "reward_std": 0.3938901126384735, + "rewards/reward_fn/mean": 3.9303693771362305, + "rewards/reward_fn/std": 0.3938901126384735, + "step": 2947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 222.21875, + "completions/mean_terminated_length": 222.21875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.3404550179004504, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "kl": 0.016106663024402224, + "learning_rate": 2.106e-06, + "loss": 0.0006, + "num_tokens": 68523711.0, + "reward": 3.8778538703918457, + "reward_std": 0.44220998883247375, + "rewards/reward_fn/mean": 3.8778538703918457, + "rewards/reward_fn/std": 0.44220995903015137, + "step": 2948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 392.59375, + "completions/mean_terminated_length": 392.59375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.34057050467721445, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.0073740799925872125, + "learning_rate": 2.104e-06, + "loss": 0.0003, + "num_tokens": 68551410.0, + "reward": 2.8275203704833984, + "reward_std": 0.1233314722776413, + "rewards/reward_fn/mean": 2.8275203704833984, + "rewards/reward_fn/std": 0.1233314648270607, + "step": 2949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 81.375, + "completions/mean_terminated_length": 81.375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.34068599145397854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1748046875, + "kl": 0.013144818527507596, + "learning_rate": 2.102e-06, + "loss": 0.0005, + "num_tokens": 68579774.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 117.34375, + "completions/mean_terminated_length": 117.34375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.3408014782307426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.014571136678569019, + "learning_rate": 2.1e-06, + "loss": 0.0006, + "num_tokens": 68596361.0, + "reward": 3.5985605716705322, + "reward_std": 0.21468116343021393, + "rewards/reward_fn/mean": 3.5985605716705322, + "rewards/reward_fn/std": 0.21468119323253632, + "step": 2951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 94.46875, + "completions/mean_terminated_length": 94.46875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.3409169650075066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.006856291329313535, + "learning_rate": 2.0979999999999996e-06, + "loss": 0.0003, + "num_tokens": 68621784.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 178.625, + "completions/mean_terminated_length": 178.625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.3410324517842707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.013830621537636034, + "learning_rate": 2.096e-06, + "loss": 0.0006, + "num_tokens": 68639788.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 353.9375, + "completions/mean_terminated_length": 353.9375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.34114793856103476, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.014289319777162746, + "learning_rate": 2.0939999999999998e-06, + "loss": 0.0006, + "num_tokens": 68673386.0, + "reward": 2.971315383911133, + "reward_std": 0.33221128582954407, + "rewards/reward_fn/mean": 2.971315383911133, + "rewards/reward_fn/std": 0.33221128582954407, + "step": 2954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 287.84375, + "completions/mean_terminated_length": 287.84375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.3412634253377988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.010694768774555996, + "learning_rate": 2.092e-06, + "loss": 0.0004, + "num_tokens": 68695077.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 281.03125, + "completions/mean_terminated_length": 281.03125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.3413789121145629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.011163993243826553, + "learning_rate": 2.09e-06, + "loss": 0.0004, + "num_tokens": 68716326.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 111.90625, + "completions/mean_terminated_length": 111.90625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.34149439889132693, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0419921875, + "kl": 0.005414395671323291, + "learning_rate": 2.088e-06, + "loss": 0.0002, + "num_tokens": 68733155.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 178.625, + "completions/mean_terminated_length": 178.625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.34160988566809103, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.008359766463399865, + "learning_rate": 2.0859999999999996e-06, + "loss": 0.0003, + "num_tokens": 68753015.0, + "reward": 3.8643546104431152, + "reward_std": 0.3211755156517029, + "rewards/reward_fn/mean": 3.8643546104431152, + "rewards/reward_fn/std": 0.32117554545402527, + "step": 2958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 275.125, + "completions/mean_terminated_length": 275.125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.34172537244485507, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.009929955609550234, + "learning_rate": 2.084e-06, + "loss": 0.0004, + "num_tokens": 68773755.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 119.625, + "completions/mean_terminated_length": 119.625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.3418408592216191, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.007682080457016127, + "learning_rate": 2.0819999999999997e-06, + "loss": 0.0003, + "num_tokens": 68785775.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 162.96875, + "completions/mean_terminated_length": 162.96875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.3419563459983832, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.025295826460933313, + "learning_rate": 2.08e-06, + "loss": 0.001, + "num_tokens": 68811630.0, + "reward": 3.1271004676818848, + "reward_std": 0.04246334731578827, + "rewards/reward_fn/mean": 3.1271004676818848, + "rewards/reward_fn/std": 0.04246333986520767, + "step": 2961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 70.8125, + "completions/mean_terminated_length": 70.8125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.34207183277514724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.208984375, + "kl": 0.021948675290332176, + "learning_rate": 2.078e-06, + "loss": 0.0009, + "num_tokens": 68835048.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 115.09375, + "completions/mean_terminated_length": 115.09375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.3421873195519113, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.004646653207601048, + "learning_rate": 2.076e-06, + "loss": 0.0002, + "num_tokens": 68855851.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 178.6875, + "completions/mean_terminated_length": 178.6875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.3423028063286754, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.0070450536732096225, + "learning_rate": 2.0739999999999995e-06, + "loss": 0.0003, + "num_tokens": 68889505.0, + "reward": 3.1667678356170654, + "reward_std": 0.09283240884542465, + "rewards/reward_fn/mean": 3.1667678356170654, + "rewards/reward_fn/std": 0.09283239394426346, + "step": 2964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 70.5625, + "completions/mean_terminated_length": 70.5625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.3424182931054394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.008122814982925775, + "learning_rate": 2.072e-06, + "loss": 0.0003, + "num_tokens": 68905715.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.3425337798822035, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.0100383323879214, + "learning_rate": 2.0699999999999997e-06, + "loss": 0.0004, + "num_tokens": 68936919.0, + "reward": 3.0881733894348145, + "reward_std": 0.3394404351711273, + "rewards/reward_fn/mean": 3.0881733894348145, + "rewards/reward_fn/std": 0.3394404351711273, + "step": 2966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 87.0, + "completions/max_terminated_length": 87.0, + "completions/mean_length": 53.875, + "completions/mean_terminated_length": 53.875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.34264926665896756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.005284007884256425, + "learning_rate": 2.068e-06, + "loss": 0.0002, + "num_tokens": 68960627.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 140.09375, + "completions/mean_terminated_length": 140.09375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.3427647534357316, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "kl": 0.011821897409390658, + "learning_rate": 2.0659999999999998e-06, + "loss": 0.0005, + "num_tokens": 68980918.0, + "reward": 3.635092258453369, + "reward_std": 0.5504845380783081, + "rewards/reward_fn/mean": 3.635092258453369, + "rewards/reward_fn/std": 0.5504845976829529, + "step": 2968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 94.0625, + "completions/mean_terminated_length": 94.0625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.3428802402124957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.005505482407897944, + "learning_rate": 2.064e-06, + "loss": 0.0002, + "num_tokens": 68991832.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 202.71875, + "completions/mean_terminated_length": 202.71875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.34299572698925973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.007610151013068389, + "learning_rate": 2.062e-06, + "loss": 0.0003, + "num_tokens": 69010607.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 216.71875, + "completions/mean_terminated_length": 216.71875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.34311121376602377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.010908690572250634, + "learning_rate": 2.0599999999999998e-06, + "loss": 0.0004, + "num_tokens": 69030726.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 133.34375, + "completions/mean_terminated_length": 133.34375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.34322670054278787, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "kl": 0.00793834199430421, + "learning_rate": 2.0579999999999996e-06, + "loss": 0.0003, + "num_tokens": 69057425.0, + "reward": 3.9565839767456055, + "reward_std": 0.17185138165950775, + "rewards/reward_fn/mean": 3.9565839767456055, + "rewards/reward_fn/std": 0.17185135185718536, + "step": 2972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 175.625, + "completions/mean_terminated_length": 175.625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.3433421873195519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044189453125, + "kl": 0.007949996906972956, + "learning_rate": 2.056e-06, + "loss": 0.0003, + "num_tokens": 69075333.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 156.84375, + "completions/mean_terminated_length": 156.84375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.34345767409631595, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78515625, + "kl": 0.009973975371394772, + "learning_rate": 2.0539999999999997e-06, + "loss": 0.0004, + "num_tokens": 69103872.0, + "reward": 3.931849241256714, + "reward_std": 0.38551923632621765, + "rewards/reward_fn/mean": 3.931849241256714, + "rewards/reward_fn/std": 0.3855191767215729, + "step": 2974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 121.28125, + "completions/mean_terminated_length": 121.28125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.34357316087308004, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.234375, + "kl": 0.01722863971372135, + "learning_rate": 2.052e-06, + "loss": 0.0007, + "num_tokens": 69125449.0, + "reward": 3.8952102661132812, + "reward_std": 0.28381478786468506, + "rewards/reward_fn/mean": 3.8952102661132812, + "rewards/reward_fn/std": 0.28381475806236267, + "step": 2975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 129.15625, + "completions/mean_terminated_length": 129.15625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.3436886476498441, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.01413770618091803, + "learning_rate": 2.05e-06, + "loss": 0.0006, + "num_tokens": 69151662.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 266.0, + "completions/mean_terminated_length": 266.0, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.3438041344266082, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.013897785043809563, + "learning_rate": 2.048e-06, + "loss": 0.0006, + "num_tokens": 69186030.0, + "reward": 3.698052406311035, + "reward_std": 0.8118411302566528, + "rewards/reward_fn/mean": 3.698052406311035, + "rewards/reward_fn/std": 0.8118411302566528, + "step": 2977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 189.53125, + "completions/mean_terminated_length": 189.53125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.3439196212033722, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.009571558519382961, + "learning_rate": 2.0459999999999996e-06, + "loss": 0.0004, + "num_tokens": 69210847.0, + "reward": 3.931130886077881, + "reward_std": 0.2211967557668686, + "rewards/reward_fn/mean": 3.931130886077881, + "rewards/reward_fn/std": 0.22119677066802979, + "step": 2978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 209.0, + "completions/mean_terminated_length": 209.0, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.34403510798013626, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.008956409015809186, + "learning_rate": 2.044e-06, + "loss": 0.0004, + "num_tokens": 69229471.0, + "reward": 3.1354804039001465, + "reward_std": 0.786696195602417, + "rewards/reward_fn/mean": 3.1354804039001465, + "rewards/reward_fn/std": 0.786696195602417, + "step": 2979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 48.375, + "completions/mean_terminated_length": 48.375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.34415059475690035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.00437160945875803, + "learning_rate": 2.0419999999999997e-06, + "loss": 0.0002, + "num_tokens": 69251499.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 203.125, + "completions/mean_terminated_length": 203.125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.3442660815336644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.009973241059924476, + "learning_rate": 2.04e-06, + "loss": 0.0004, + "num_tokens": 69277391.0, + "reward": 3.9048895835876465, + "reward_std": 0.41644394397735596, + "rewards/reward_fn/mean": 3.9048895835876465, + "rewards/reward_fn/std": 0.41644397377967834, + "step": 2981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 303.59375, + "completions/mean_terminated_length": 303.59375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.34438156831042843, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.01202208576432895, + "learning_rate": 2.038e-06, + "loss": 0.0005, + "num_tokens": 69303138.0, + "reward": 3.9276561737060547, + "reward_std": 0.40923914313316345, + "rewards/reward_fn/mean": 3.9276561737060547, + "rewards/reward_fn/std": 0.4092392027378082, + "step": 2982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 424.28125, + "completions/mean_terminated_length": 424.28125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.3444970550871925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.009240274172043428, + "learning_rate": 2.036e-06, + "loss": 0.0004, + "num_tokens": 69332907.0, + "reward": 2.9491238594055176, + "reward_std": 0.49970752000808716, + "rewards/reward_fn/mean": 2.9491238594055176, + "rewards/reward_fn/std": 0.49970749020576477, + "step": 2983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 171.65625, + "completions/mean_terminated_length": 171.65625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.34461254186395657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.012800505937775597, + "learning_rate": 2.034e-06, + "loss": 0.0005, + "num_tokens": 69356736.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 107.25, + "completions/mean_terminated_length": 107.25, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.34472802864072066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057373046875, + "kl": 0.006519007842143765, + "learning_rate": 2.0319999999999998e-06, + "loss": 0.0003, + "num_tokens": 69372136.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 258.46875, + "completions/mean_terminated_length": 258.46875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.3448435154174847, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.011345405131578445, + "learning_rate": 2.0299999999999996e-06, + "loss": 0.0005, + "num_tokens": 69399639.0, + "reward": 3.5216243267059326, + "reward_std": 0.7838878631591797, + "rewards/reward_fn/mean": 3.5216243267059326, + "rewards/reward_fn/std": 0.7838878631591797, + "step": 2986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 340.6875, + "completions/mean_terminated_length": 340.6875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.34495900219424874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.009064502461114898, + "learning_rate": 2.028e-06, + "loss": 0.0004, + "num_tokens": 69426573.0, + "reward": 3.8582797050476074, + "reward_std": 0.5577109456062317, + "rewards/reward_fn/mean": 3.8582797050476074, + "rewards/reward_fn/std": 0.5577109456062317, + "step": 2987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 143.625, + "completions/mean_terminated_length": 143.625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.34507448897101284, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.00811011390760541, + "learning_rate": 2.0259999999999997e-06, + "loss": 0.0003, + "num_tokens": 69448929.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 113.71875, + "completions/mean_terminated_length": 113.71875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.3451899757477769, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.008249523387348745, + "learning_rate": 2.024e-06, + "loss": 0.0003, + "num_tokens": 69468632.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 436.03125, + "completions/mean_terminated_length": 436.03125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.3453054625245409, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.009045271814102307, + "learning_rate": 2.022e-06, + "loss": 0.0004, + "num_tokens": 69497977.0, + "reward": 3.9289703369140625, + "reward_std": 0.40180379152297974, + "rewards/reward_fn/mean": 3.9289703369140625, + "rewards/reward_fn/std": 0.4018038213253021, + "step": 2990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 287.0, + "completions/mean_terminated_length": 287.0, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.345420949301305, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.016418335391790606, + "learning_rate": 2.0199999999999997e-06, + "loss": 0.0007, + "num_tokens": 69525721.0, + "reward": 3.871016025543213, + "reward_std": 0.3528216481208801, + "rewards/reward_fn/mean": 3.871016025543213, + "rewards/reward_fn/std": 0.3528216481208801, + "step": 2991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 114.9375, + "completions/mean_terminated_length": 114.9375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.34553643607806905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.009427590142877307, + "learning_rate": 2.0179999999999996e-06, + "loss": 0.0004, + "num_tokens": 69556631.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 400.625, + "completions/mean_terminated_length": 347.4838562011719, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.34565192285483315, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.009465010909480043, + "learning_rate": 2.016e-06, + "loss": 0.0004, + "num_tokens": 69593035.0, + "reward": 3.665475368499756, + "reward_std": 0.9402363300323486, + "rewards/reward_fn/mean": 3.665475368499756, + "rewards/reward_fn/std": 0.9402363300323486, + "step": 2993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 84.0, + "completions/mean_length": 65.0625, + "completions/mean_terminated_length": 65.0625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.3457674096315972, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2021484375, + "kl": 0.011594950603466714, + "learning_rate": 2.0139999999999997e-06, + "loss": 0.0005, + "num_tokens": 69614189.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 455.4375, + "completions/mean_terminated_length": 455.4375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.34588289640836123, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.011285028012935072, + "learning_rate": 2.012e-06, + "loss": 0.0005, + "num_tokens": 69643707.0, + "reward": 3.6398050785064697, + "reward_std": 0.8505828976631165, + "rewards/reward_fn/mean": 3.6398050785064697, + "rewards/reward_fn/std": 0.8505828976631165, + "step": 2995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 120.0625, + "completions/mean_terminated_length": 120.0625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.3459983831851253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1025390625, + "kl": 0.016317753717885353, + "learning_rate": 2.01e-06, + "loss": 0.0007, + "num_tokens": 69659453.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 181.96875, + "completions/mean_terminated_length": 181.96875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.34611386996188936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.007316946299397387, + "learning_rate": 2.008e-06, + "loss": 0.0003, + "num_tokens": 69677980.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 94.71875, + "completions/mean_terminated_length": 94.71875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.3462293567386534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.007589893833937822, + "learning_rate": 2.0059999999999995e-06, + "loss": 0.0003, + "num_tokens": 69700179.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 167.75, + "completions/mean_terminated_length": 167.75, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.3463448435154175, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.015748887904919684, + "learning_rate": 2.0039999999999998e-06, + "loss": 0.0006, + "num_tokens": 69728395.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 2999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 231.0, + "completions/mean_terminated_length": 231.0, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.34646033029218154, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.011973690532613546, + "learning_rate": 2.0019999999999996e-06, + "loss": 0.0005, + "num_tokens": 69748395.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 76.9375, + "completions/mean_terminated_length": 76.9375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.3465758170689456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.011412360130634625, + "learning_rate": 2e-06, + "loss": 0.0005, + "num_tokens": 69762601.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 192.75, + "completions/mean_terminated_length": 192.75, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.3466913038457097, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.009277068129449617, + "learning_rate": 1.9979999999999998e-06, + "loss": 0.0004, + "num_tokens": 69785601.0, + "reward": 3.91611909866333, + "reward_std": 0.2666860818862915, + "rewards/reward_fn/mean": 3.91611909866333, + "rewards/reward_fn/std": 0.2666860520839691, + "step": 3002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 216.59375, + "completions/mean_terminated_length": 216.59375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.3468067906224737, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.020430062868399546, + "learning_rate": 1.996e-06, + "loss": 0.0008, + "num_tokens": 69815316.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 234.34375, + "completions/mean_terminated_length": 234.34375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.3469222773992378, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.011181408910488244, + "learning_rate": 1.994e-06, + "loss": 0.0004, + "num_tokens": 69846943.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 126.3125, + "completions/mean_terminated_length": 126.3125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.34703776417600185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.008823028867482208, + "learning_rate": 1.9919999999999997e-06, + "loss": 0.0004, + "num_tokens": 69876937.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 257.1875, + "completions/mean_terminated_length": 257.1875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.3471532509527659, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.007954594526381698, + "learning_rate": 1.99e-06, + "loss": 0.0003, + "num_tokens": 69905295.0, + "reward": 2.8537440299987793, + "reward_std": 0.2149791270494461, + "rewards/reward_fn/mean": 2.8537440299987793, + "rewards/reward_fn/std": 0.2149791419506073, + "step": 3006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 362.46875, + "completions/mean_terminated_length": 308.0967712402344, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.34726873772953, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.02106434640882071, + "learning_rate": 1.988e-06, + "loss": 0.0008, + "num_tokens": 69940670.0, + "reward": 2.051478147506714, + "reward_std": 0.6623335480690002, + "rewards/reward_fn/mean": 2.051478147506714, + "rewards/reward_fn/std": 0.6623334884643555, + "step": 3007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 91.9375, + "completions/mean_terminated_length": 91.9375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.347384224506294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.007862537586333929, + "learning_rate": 1.9859999999999997e-06, + "loss": 0.0003, + "num_tokens": 69964828.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 248.875, + "completions/mean_terminated_length": 248.875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.34749971128305807, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.007766717561025871, + "learning_rate": 1.984e-06, + "loss": 0.0003, + "num_tokens": 69987320.0, + "reward": 3.8286752700805664, + "reward_std": 0.5862489938735962, + "rewards/reward_fn/mean": 3.8286752700805664, + "rewards/reward_fn/std": 0.586249053478241, + "step": 3009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 161.53125, + "completions/mean_terminated_length": 161.53125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.34761519805982216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.0060188636925886385, + "learning_rate": 1.982e-06, + "loss": 0.0002, + "num_tokens": 70016073.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 324.53125, + "completions/mean_terminated_length": 324.53125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.3477306848365862, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.013219913947978057, + "learning_rate": 1.98e-06, + "loss": 0.0005, + "num_tokens": 70034714.0, + "reward": 3.267841339111328, + "reward_std": 0.5913332104682922, + "rewards/reward_fn/mean": 3.267841339111328, + "rewards/reward_fn/std": 0.5913331508636475, + "step": 3011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 88.0, + "completions/max_terminated_length": 88.0, + "completions/mean_length": 63.4375, + "completions/mean_terminated_length": 63.4375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.3478461716133503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12158203125, + "kl": 0.00924180168658495, + "learning_rate": 1.978e-06, + "loss": 0.0004, + "num_tokens": 70052648.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 205.71875, + "completions/mean_terminated_length": 205.71875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.34796165839011434, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.00796643395733554, + "learning_rate": 1.976e-06, + "loss": 0.0003, + "num_tokens": 70075711.0, + "reward": 3.9261133670806885, + "reward_std": 0.4179660677909851, + "rewards/reward_fn/mean": 3.9261133670806885, + "rewards/reward_fn/std": 0.4179660975933075, + "step": 3013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 172.71875, + "completions/mean_terminated_length": 172.71875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.3480771451668784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046630859375, + "kl": 0.010743919279775582, + "learning_rate": 1.974e-06, + "loss": 0.0004, + "num_tokens": 70104790.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 183.6875, + "completions/mean_terminated_length": 183.6875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.34819263194364247, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.025087849789997563, + "learning_rate": 1.972e-06, + "loss": 0.001, + "num_tokens": 70121868.0, + "reward": 3.0704522132873535, + "reward_std": 0.281649649143219, + "rewards/reward_fn/mean": 3.0704522132873535, + "rewards/reward_fn/std": 0.281649649143219, + "step": 3015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 250.6875, + "completions/mean_terminated_length": 250.6875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.3483081187204065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0439453125, + "kl": 0.008393459851504304, + "learning_rate": 1.9699999999999998e-06, + "loss": 0.0003, + "num_tokens": 70144450.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 69.90625, + "completions/mean_terminated_length": 69.90625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.34842360549717055, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.005709315420972416, + "learning_rate": 1.968e-06, + "loss": 0.0002, + "num_tokens": 70154495.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 73.9375, + "completions/mean_terminated_length": 73.9375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.34853909227393465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15625, + "kl": 0.01407857408048585, + "learning_rate": 1.966e-06, + "loss": 0.0006, + "num_tokens": 70170269.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 88.0, + "completions/max_terminated_length": 88.0, + "completions/mean_length": 66.125, + "completions/mean_terminated_length": 66.125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.3486545790506987, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.0037299550349416677, + "learning_rate": 1.9639999999999997e-06, + "loss": 0.0001, + "num_tokens": 70189665.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 196.40625, + "completions/mean_terminated_length": 196.40625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.3487700658274628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.008126474000164308, + "learning_rate": 1.962e-06, + "loss": 0.0003, + "num_tokens": 70208526.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 260.3125, + "completions/mean_terminated_length": 260.3125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.3488855526042268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03955078125, + "kl": 0.007450745368259959, + "learning_rate": 1.96e-06, + "loss": 0.0003, + "num_tokens": 70231800.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 226.34375, + "completions/mean_terminated_length": 226.34375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.34900103938099086, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.00999067080556415, + "learning_rate": 1.9579999999999997e-06, + "loss": 0.0004, + "num_tokens": 70254083.0, + "reward": 3.969867706298828, + "reward_std": 0.17045453190803528, + "rewards/reward_fn/mean": 3.969867706298828, + "rewards/reward_fn/std": 0.17045453190803528, + "step": 3022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 125.4375, + "completions/mean_terminated_length": 125.4375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.34911652615775496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.011959457864577416, + "learning_rate": 1.956e-06, + "loss": 0.0005, + "num_tokens": 70274353.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 223.40625, + "completions/mean_terminated_length": 223.40625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.349232012934519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03759765625, + "kl": 0.007111247185093816, + "learning_rate": 1.954e-06, + "loss": 0.0003, + "num_tokens": 70298046.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 170.84375, + "completions/mean_terminated_length": 170.84375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.34934749971128304, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.008253769447037484, + "learning_rate": 1.9519999999999997e-06, + "loss": 0.0003, + "num_tokens": 70323065.0, + "reward": 3.7906301021575928, + "reward_std": 0.5786857604980469, + "rewards/reward_fn/mean": 3.7906301021575928, + "rewards/reward_fn/std": 0.5786857604980469, + "step": 3025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 115.21875, + "completions/mean_terminated_length": 115.21875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.34946298648804713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.01166815423493972, + "learning_rate": 1.95e-06, + "loss": 0.0005, + "num_tokens": 70342944.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 165.78125, + "completions/mean_terminated_length": 165.78125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.3495784732648112, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.015527570649283007, + "learning_rate": 1.948e-06, + "loss": 0.0006, + "num_tokens": 70361721.0, + "reward": 3.8931403160095215, + "reward_std": 0.41971689462661743, + "rewards/reward_fn/mean": 3.8931403160095215, + "rewards/reward_fn/std": 0.41971686482429504, + "step": 3027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1316.0, + "completions/max_terminated_length": 1316.0, + "completions/mean_length": 456.0625, + "completions/mean_terminated_length": 456.0625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.3496939600415752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.01527469465509057, + "learning_rate": 1.9459999999999997e-06, + "loss": 0.0006, + "num_tokens": 70391611.0, + "reward": 3.9313015937805176, + "reward_std": 0.3886173963546753, + "rewards/reward_fn/mean": 3.9313015937805176, + "rewards/reward_fn/std": 0.3886173665523529, + "step": 3028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 68.65625, + "completions/mean_terminated_length": 68.65625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.3498094468183393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.004426899900863646, + "learning_rate": 1.944e-06, + "loss": 0.0002, + "num_tokens": 70410576.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 104.03125, + "completions/mean_terminated_length": 104.03125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.34992493359510335, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0, + "kl": 0.008108455027468153, + "learning_rate": 1.9419999999999998e-06, + "loss": 0.0003, + "num_tokens": 70427153.0, + "reward": 3.928506851196289, + "reward_std": 0.22637367248535156, + "rewards/reward_fn/mean": 3.928506851196289, + "rewards/reward_fn/std": 0.22637364268302917, + "step": 3030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 177.53125, + "completions/mean_terminated_length": 177.53125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.35004042037186744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052001953125, + "kl": 0.007573478840640746, + "learning_rate": 1.94e-06, + "loss": 0.0003, + "num_tokens": 70445954.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 387.75, + "completions/mean_terminated_length": 387.75, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.3501559071486315, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.008495380956446752, + "learning_rate": 1.938e-06, + "loss": 0.0003, + "num_tokens": 70468186.0, + "reward": 3.8864517211914062, + "reward_std": 0.4671589434146881, + "rewards/reward_fn/mean": 3.8864517211914062, + "rewards/reward_fn/std": 0.4671589434146881, + "step": 3032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 154.0625, + "completions/mean_terminated_length": 154.0625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.3502713939253955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.014463778570643626, + "learning_rate": 1.9359999999999998e-06, + "loss": 0.0006, + "num_tokens": 70491260.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 320.25, + "completions/mean_terminated_length": 320.25, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.3503868807021596, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.011121483257738873, + "learning_rate": 1.934e-06, + "loss": 0.0004, + "num_tokens": 70514468.0, + "reward": 2.917451858520508, + "reward_std": 0.4177452325820923, + "rewards/reward_fn/mean": 2.917451858520508, + "rewards/reward_fn/std": 0.4177452027797699, + "step": 3034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 184.6875, + "completions/mean_terminated_length": 184.6875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.35050236747892366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051025390625, + "kl": 0.00873806176969083, + "learning_rate": 1.932e-06, + "loss": 0.0003, + "num_tokens": 70531386.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 294.5, + "completions/mean_terminated_length": 294.5, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.3506178542556877, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.013127835292834789, + "learning_rate": 1.9299999999999997e-06, + "loss": 0.0005, + "num_tokens": 70565066.0, + "reward": 3.4306347370147705, + "reward_std": 0.3714104890823364, + "rewards/reward_fn/mean": 3.4306347370147705, + "rewards/reward_fn/std": 0.3714104890823364, + "step": 3036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 158.59375, + "completions/mean_terminated_length": 158.59375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.3507333410324518, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.011718445653968956, + "learning_rate": 1.928e-06, + "loss": 0.0005, + "num_tokens": 70588253.0, + "reward": 3.9734292030334473, + "reward_std": 0.15030695497989655, + "rewards/reward_fn/mean": 3.9734292030334473, + "rewards/reward_fn/std": 0.15030691027641296, + "step": 3037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 222.71875, + "completions/mean_terminated_length": 222.71875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.35084882780921584, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.01497876005305443, + "learning_rate": 1.926e-06, + "loss": 0.0006, + "num_tokens": 70613780.0, + "reward": 3.2665047645568848, + "reward_std": 0.323147714138031, + "rewards/reward_fn/mean": 3.2665047645568848, + "rewards/reward_fn/std": 0.323147714138031, + "step": 3038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 69.3125, + "completions/mean_terminated_length": 69.3125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.35096431458597993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.00912076420354424, + "learning_rate": 1.9239999999999997e-06, + "loss": 0.0004, + "num_tokens": 70629758.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 382.34375, + "completions/mean_terminated_length": 382.34375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.35107980136274397, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043212890625, + "kl": 0.008625605361885391, + "learning_rate": 1.922e-06, + "loss": 0.0003, + "num_tokens": 70651849.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 100.0, + "completions/max_terminated_length": 100.0, + "completions/mean_length": 72.8125, + "completions/mean_terminated_length": 72.8125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.351195288139508, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04345703125, + "kl": 0.0037222348855721066, + "learning_rate": 1.92e-06, + "loss": 0.0001, + "num_tokens": 70674851.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1216.0, + "completions/max_terminated_length": 1216.0, + "completions/mean_length": 328.09375, + "completions/mean_terminated_length": 328.09375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.3513107749162721, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.020688027259893715, + "learning_rate": 1.9179999999999997e-06, + "loss": 0.0008, + "num_tokens": 70707654.0, + "reward": 2.8625245094299316, + "reward_std": 0.3879382908344269, + "rewards/reward_fn/mean": 2.8625245094299316, + "rewards/reward_fn/std": 0.3879382908344269, + "step": 3042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 66.28125, + "completions/mean_terminated_length": 66.28125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.35142626169303615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.010037255266070133, + "learning_rate": 1.916e-06, + "loss": 0.0004, + "num_tokens": 70719183.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 115.875, + "completions/mean_terminated_length": 115.875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.3515417484698002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.009861336431640666, + "learning_rate": 1.914e-06, + "loss": 0.0004, + "num_tokens": 70740075.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 83.65625, + "completions/mean_terminated_length": 83.65625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.3516572352465643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.007949801878567087, + "learning_rate": 1.9119999999999997e-06, + "loss": 0.0003, + "num_tokens": 70759776.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 185.0, + "completions/mean_terminated_length": 185.0, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.3517727220233283, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.008598219785199035, + "learning_rate": 1.91e-06, + "loss": 0.0003, + "num_tokens": 70791232.0, + "reward": 3.977659225463867, + "reward_std": 0.12637856602668762, + "rewards/reward_fn/mean": 3.977659225463867, + "rewards/reward_fn/std": 0.12637858092784882, + "step": 3046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 182.90625, + "completions/mean_terminated_length": 182.90625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.3518882088000924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04736328125, + "kl": 0.009424264397239313, + "learning_rate": 1.9079999999999998e-06, + "loss": 0.0004, + "num_tokens": 70806685.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 115.84375, + "completions/mean_terminated_length": 115.84375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.35200369557685646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.010937618761090562, + "learning_rate": 1.9059999999999998e-06, + "loss": 0.0004, + "num_tokens": 70830008.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 146.375, + "completions/mean_terminated_length": 146.375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.3521191823536205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.008816579244012246, + "learning_rate": 1.904e-06, + "loss": 0.0004, + "num_tokens": 70846084.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 209.09375, + "completions/mean_terminated_length": 209.09375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.3522346691303846, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.006748559761035722, + "learning_rate": 1.9019999999999997e-06, + "loss": 0.0003, + "num_tokens": 70862055.0, + "reward": 2.8954672813415527, + "reward_std": 0.02349381148815155, + "rewards/reward_fn/mean": 2.8954672813415527, + "rewards/reward_fn/std": 0.023493783548474312, + "step": 3050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 124.4375, + "completions/mean_terminated_length": 124.4375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.35235015590714863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053955078125, + "kl": 0.00865077564958483, + "learning_rate": 1.8999999999999998e-06, + "loss": 0.0003, + "num_tokens": 70877941.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 155.5625, + "completions/mean_terminated_length": 155.5625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.35246564268391267, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.007396655004413333, + "learning_rate": 1.8979999999999999e-06, + "loss": 0.0003, + "num_tokens": 70902791.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 241.65625, + "completions/mean_terminated_length": 241.65625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.35258112946067677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0576171875, + "kl": 0.009153904131380841, + "learning_rate": 1.8959999999999997e-06, + "loss": 0.0004, + "num_tokens": 70921852.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 267.03125, + "completions/mean_terminated_length": 267.03125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.3526966162374408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.013156847708160058, + "learning_rate": 1.8939999999999998e-06, + "loss": 0.0005, + "num_tokens": 70942973.0, + "reward": 3.6584293842315674, + "reward_std": 0.8065954446792603, + "rewards/reward_fn/mean": 3.6584293842315674, + "rewards/reward_fn/std": 0.8065953850746155, + "step": 3054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 258.25, + "completions/mean_terminated_length": 258.25, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.35281210301420485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04296875, + "kl": 0.009879358854959719, + "learning_rate": 1.8919999999999998e-06, + "loss": 0.0004, + "num_tokens": 70965797.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 189.46875, + "completions/mean_terminated_length": 189.46875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.35292758979096894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0400390625, + "kl": 0.0065879182366188616, + "learning_rate": 1.89e-06, + "loss": 0.0003, + "num_tokens": 70980052.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 112.0, + "completions/mean_terminated_length": 112.0, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.353043076567733, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.109375, + "kl": 0.015622089529642835, + "learning_rate": 1.8879999999999998e-06, + "loss": 0.0006, + "num_tokens": 70995924.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 163.03125, + "completions/mean_terminated_length": 163.03125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.3531585633444971, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04296875, + "kl": 0.007871527152019553, + "learning_rate": 1.8859999999999998e-06, + "loss": 0.0003, + "num_tokens": 71013429.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 214.0, + "completions/mean_terminated_length": 214.0, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.3532740501212611, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.015986798782250844, + "learning_rate": 1.8839999999999999e-06, + "loss": 0.0006, + "num_tokens": 71032661.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 188.125, + "completions/mean_terminated_length": 188.125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.35338953689802516, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.007581748788652476, + "learning_rate": 1.8819999999999997e-06, + "loss": 0.0003, + "num_tokens": 71057209.0, + "reward": 3.3775153160095215, + "reward_std": 0.6014329195022583, + "rewards/reward_fn/mean": 3.3775153160095215, + "rewards/reward_fn/std": 0.6014328598976135, + "step": 3060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 119.40625, + "completions/mean_terminated_length": 119.40625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.35350502367478925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.0044114026150055, + "learning_rate": 1.8799999999999998e-06, + "loss": 0.0002, + "num_tokens": 71081606.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1182.0, + "completions/max_terminated_length": 1182.0, + "completions/mean_length": 247.65625, + "completions/mean_terminated_length": 247.65625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.3536205104515533, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.010616690931783523, + "learning_rate": 1.8779999999999998e-06, + "loss": 0.0004, + "num_tokens": 71110459.0, + "reward": 3.815920829772949, + "reward_std": 0.7341221570968628, + "rewards/reward_fn/mean": 3.815920829772949, + "rewards/reward_fn/std": 0.7341221570968628, + "step": 3062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 203.96875, + "completions/mean_terminated_length": 203.96875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.35373599722831733, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.00764397787861526, + "learning_rate": 1.8759999999999997e-06, + "loss": 0.0003, + "num_tokens": 71141274.0, + "reward": 3.0066545009613037, + "reward_std": 0.21569593250751495, + "rewards/reward_fn/mean": 3.0066545009613037, + "rewards/reward_fn/std": 0.21569588780403137, + "step": 3063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 298.34375, + "completions/mean_terminated_length": 298.34375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.35385148400508143, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.009405124925251585, + "learning_rate": 1.874e-06, + "loss": 0.0004, + "num_tokens": 71158981.0, + "reward": 3.8954973220825195, + "reward_std": 0.44860324263572693, + "rewards/reward_fn/mean": 3.8954973220825195, + "rewards/reward_fn/std": 0.44860324263572693, + "step": 3064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 147.71875, + "completions/mean_terminated_length": 147.71875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.35396697078184547, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.015625, + "kl": 0.013916782918386161, + "learning_rate": 1.872e-06, + "loss": 0.0006, + "num_tokens": 71185308.0, + "reward": 3.8859729766845703, + "reward_std": 0.27033597230911255, + "rewards/reward_fn/mean": 3.8859729766845703, + "rewards/reward_fn/std": 0.27033600211143494, + "step": 3065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 173.84375, + "completions/mean_terminated_length": 173.84375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.35408245755860956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.007581410449347459, + "learning_rate": 1.87e-06, + "loss": 0.0003, + "num_tokens": 71211031.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 258.5, + "completions/mean_terminated_length": 258.5, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.3541979443353736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.009405178221641108, + "learning_rate": 1.868e-06, + "loss": 0.0004, + "num_tokens": 71242439.0, + "reward": 3.5065555572509766, + "reward_std": 0.9077292084693909, + "rewards/reward_fn/mean": 3.5065555572509766, + "rewards/reward_fn/std": 0.9077292084693909, + "step": 3067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 103.21875, + "completions/mean_terminated_length": 103.21875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.35431343111213764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.010310021112672985, + "learning_rate": 1.866e-06, + "loss": 0.0004, + "num_tokens": 71259118.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 104.28125, + "completions/mean_terminated_length": 104.28125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.35442891788890174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.007604655264003668, + "learning_rate": 1.864e-06, + "loss": 0.0003, + "num_tokens": 71286807.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 125.4375, + "completions/mean_terminated_length": 125.4375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.3545444046656658, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.010083527777169365, + "learning_rate": 1.862e-06, + "loss": 0.0004, + "num_tokens": 71309829.0, + "reward": 3.979964256286621, + "reward_std": 0.11333901435136795, + "rewards/reward_fn/mean": 3.979964256286621, + "rewards/reward_fn/std": 0.11333902925252914, + "step": 3070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 110.625, + "completions/mean_terminated_length": 110.625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.3546598914424298, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.953125, + "kl": 0.01349670324998442, + "learning_rate": 1.86e-06, + "loss": 0.0005, + "num_tokens": 71327097.0, + "reward": 3.978750705718994, + "reward_std": 0.12020418792963028, + "rewards/reward_fn/mean": 3.978750705718994, + "rewards/reward_fn/std": 0.12020420283079147, + "step": 3071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 156.46875, + "completions/mean_terminated_length": 156.46875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.3547753782191939, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.006213046079210471, + "learning_rate": 1.858e-06, + "loss": 0.0002, + "num_tokens": 71344360.0, + "reward": 3.931994676589966, + "reward_std": 0.38469618558883667, + "rewards/reward_fn/mean": 3.931994676589966, + "rewards/reward_fn/std": 0.3846961557865143, + "step": 3072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 136.375, + "completions/mean_terminated_length": 136.375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.35489086499595796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150390625, + "kl": 0.004040637461002916, + "learning_rate": 1.856e-06, + "loss": 0.0002, + "num_tokens": 71361332.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 208.0625, + "completions/mean_terminated_length": 208.0625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.35500635177272205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04248046875, + "kl": 0.007367807244008873, + "learning_rate": 1.854e-06, + "loss": 0.0003, + "num_tokens": 71396886.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 112.1875, + "completions/mean_terminated_length": 112.1875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.3551218385494861, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.006222557018190855, + "learning_rate": 1.852e-06, + "loss": 0.0002, + "num_tokens": 71424540.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 349.9375, + "completions/mean_terminated_length": 349.9375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.35523732532625013, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036376953125, + "kl": 0.007604712031024974, + "learning_rate": 1.85e-06, + "loss": 0.0003, + "num_tokens": 71450874.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 73.65625, + "completions/mean_terminated_length": 73.65625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.3553528121030142, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05712890625, + "kl": 0.004602192082529655, + "learning_rate": 1.848e-06, + "loss": 0.0002, + "num_tokens": 71462543.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 82.59375, + "completions/mean_terminated_length": 82.59375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.35546829887977827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.011501823239086661, + "learning_rate": 1.846e-06, + "loss": 0.0005, + "num_tokens": 71481954.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 179.5, + "completions/mean_terminated_length": 179.5, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.3555837856565423, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.010695923716411926, + "learning_rate": 1.844e-06, + "loss": 0.0004, + "num_tokens": 71500242.0, + "reward": 3.471677541732788, + "reward_std": 0.5374919176101685, + "rewards/reward_fn/mean": 3.471677541732788, + "rewards/reward_fn/std": 0.5374919176101685, + "step": 3079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 204.8125, + "completions/mean_terminated_length": 204.8125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.3556992724333064, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.009263058054784779, + "learning_rate": 1.842e-06, + "loss": 0.0004, + "num_tokens": 71533196.0, + "reward": 2.8521480560302734, + "reward_std": 0.44126835465431213, + "rewards/reward_fn/mean": 2.8521480560302734, + "rewards/reward_fn/std": 0.44126835465431213, + "step": 3080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 302.34375, + "completions/mean_terminated_length": 302.34375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.35581475921007044, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.013602475984953344, + "learning_rate": 1.84e-06, + "loss": 0.0005, + "num_tokens": 71563863.0, + "reward": 3.2002875804901123, + "reward_std": 0.5232313275337219, + "rewards/reward_fn/mean": 3.2002875804901123, + "rewards/reward_fn/std": 0.5232312083244324, + "step": 3081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 137.03125, + "completions/mean_terminated_length": 137.03125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.3559302459868345, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.328125, + "kl": 0.0098865619947901, + "learning_rate": 1.838e-06, + "loss": 0.0004, + "num_tokens": 71586808.0, + "reward": 3.8830580711364746, + "reward_std": 0.2762562930583954, + "rewards/reward_fn/mean": 3.8830580711364746, + "rewards/reward_fn/std": 0.2762563228607178, + "step": 3082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 92.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 61.34375, + "completions/mean_terminated_length": 61.34375, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.3560457327635986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041259765625, + "kl": 0.0031072529700395535, + "learning_rate": 1.836e-06, + "loss": 0.0001, + "num_tokens": 71603619.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 147.8125, + "completions/mean_terminated_length": 147.8125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.3561612195403626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.007045509075396694, + "learning_rate": 1.834e-06, + "loss": 0.0003, + "num_tokens": 71626429.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 264.90625, + "completions/mean_terminated_length": 264.90625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.3562767063171267, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.0088060284324456, + "learning_rate": 1.832e-06, + "loss": 0.0004, + "num_tokens": 71659130.0, + "reward": 3.765367031097412, + "reward_std": 0.5423040986061096, + "rewards/reward_fn/mean": 3.765367031097412, + "rewards/reward_fn/std": 0.5423040986061096, + "step": 3085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 191.8125, + "completions/mean_terminated_length": 191.8125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.35639219309389075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042236328125, + "kl": 0.006095509234000929, + "learning_rate": 1.83e-06, + "loss": 0.0002, + "num_tokens": 71685652.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 891.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 398.75, + "completions/mean_terminated_length": 398.75, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.3565076798706548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.93359375, + "kl": 0.009244054926966783, + "learning_rate": 1.828e-06, + "loss": 0.0004, + "num_tokens": 71721036.0, + "reward": 3.5794034004211426, + "reward_std": 0.9505541324615479, + "rewards/reward_fn/mean": 3.5794034004211426, + "rewards/reward_fn/std": 0.9505540728569031, + "step": 3087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 297.15625, + "completions/mean_terminated_length": 297.15625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.3566231666474189, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.008946150832343847, + "learning_rate": 1.826e-06, + "loss": 0.0004, + "num_tokens": 71750193.0, + "reward": 3.761016845703125, + "reward_std": 0.6802463531494141, + "rewards/reward_fn/mean": 3.761016845703125, + "rewards/reward_fn/std": 0.6802462935447693, + "step": 3088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 227.4375, + "completions/mean_terminated_length": 227.4375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.3567386534241829, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.010473989095771685, + "learning_rate": 1.824e-06, + "loss": 0.0004, + "num_tokens": 71780255.0, + "reward": 3.2771658897399902, + "reward_std": 1.0416570901870728, + "rewards/reward_fn/mean": 3.2771658897399902, + "rewards/reward_fn/std": 1.0416570901870728, + "step": 3089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 120.6875, + "completions/mean_terminated_length": 120.6875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.35685414020094697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.012787660736648832, + "learning_rate": 1.8219999999999999e-06, + "loss": 0.0005, + "num_tokens": 71794933.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 271.09375, + "completions/mean_terminated_length": 271.09375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.35696962697771106, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.011354067813954316, + "learning_rate": 1.82e-06, + "loss": 0.0005, + "num_tokens": 71819160.0, + "reward": 3.9651989936828613, + "reward_std": 0.19686387479305267, + "rewards/reward_fn/mean": 3.9651989936828613, + "rewards/reward_fn/std": 0.1968638300895691, + "step": 3091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 164.03125, + "completions/mean_terminated_length": 164.03125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.3570851137544751, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.00966921685176203, + "learning_rate": 1.818e-06, + "loss": 0.0004, + "num_tokens": 71835641.0, + "reward": 3.9706342220306396, + "reward_std": 0.16611799597740173, + "rewards/reward_fn/mean": 3.9706342220306396, + "rewards/reward_fn/std": 0.16611801087856293, + "step": 3092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 217.4375, + "completions/mean_terminated_length": 217.4375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.3572006005312392, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.009722117574710865, + "learning_rate": 1.816e-06, + "loss": 0.0004, + "num_tokens": 71861287.0, + "reward": 3.9271962642669678, + "reward_std": 0.4118400514125824, + "rewards/reward_fn/mean": 3.9271962642669678, + "rewards/reward_fn/std": 0.4118400812149048, + "step": 3093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 265.375, + "completions/mean_terminated_length": 265.375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.35731608730800324, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.022095191845437512, + "learning_rate": 1.814e-06, + "loss": 0.0009, + "num_tokens": 71890899.0, + "reward": 3.266857862472534, + "reward_std": 0.18654154241085052, + "rewards/reward_fn/mean": 3.266857862472534, + "rewards/reward_fn/std": 0.18654155731201172, + "step": 3094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 465.65625, + "completions/mean_terminated_length": 465.65625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.3574315740847673, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.0074371759765199386, + "learning_rate": 1.812e-06, + "loss": 0.0003, + "num_tokens": 71915560.0, + "reward": 3.929356336593628, + "reward_std": 0.39962056279182434, + "rewards/reward_fn/mean": 3.929356336593628, + "rewards/reward_fn/std": 0.39962053298950195, + "step": 3095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 80.90625, + "completions/mean_terminated_length": 80.90625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.3575470608615314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.00749238165008137, + "learning_rate": 1.81e-06, + "loss": 0.0003, + "num_tokens": 71931877.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 62.4375, + "completions/mean_terminated_length": 62.4375, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.3576625476382954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19921875, + "kl": 0.010312554646588978, + "learning_rate": 1.8079999999999999e-06, + "loss": 0.0004, + "num_tokens": 71945523.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 97.8125, + "completions/mean_terminated_length": 97.8125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.35777803441505945, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.008567137920181267, + "learning_rate": 1.806e-06, + "loss": 0.0003, + "num_tokens": 71966093.0, + "reward": 3.9712977409362793, + "reward_std": 0.16236479580402374, + "rewards/reward_fn/mean": 3.9712977409362793, + "rewards/reward_fn/std": 0.16236478090286255, + "step": 3098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 195.0625, + "completions/mean_terminated_length": 195.0625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.35789352119182355, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.021232706058071926, + "learning_rate": 1.804e-06, + "loss": 0.0009, + "num_tokens": 71989071.0, + "reward": 3.950456142425537, + "reward_std": 0.1961875706911087, + "rewards/reward_fn/mean": 3.950456142425537, + "rewards/reward_fn/std": 0.1961876004934311, + "step": 3099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 156.75, + "completions/mean_terminated_length": 156.75, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.3580090079685876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.011822294298326597, + "learning_rate": 1.802e-06, + "loss": 0.0005, + "num_tokens": 72016903.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 179.28125, + "completions/mean_terminated_length": 179.28125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.3581244947453517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061767578125, + "kl": 0.00958925143640954, + "learning_rate": 1.8e-06, + "loss": 0.0004, + "num_tokens": 72035696.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 339.71875, + "completions/mean_terminated_length": 339.71875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.3582399815221157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.009870299676549621, + "learning_rate": 1.798e-06, + "loss": 0.0004, + "num_tokens": 72061575.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 164.9375, + "completions/mean_terminated_length": 164.9375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.35835546829887976, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.01086253221728839, + "learning_rate": 1.796e-06, + "loss": 0.0004, + "num_tokens": 72094629.0, + "reward": 3.9340286254882812, + "reward_std": 0.3731898367404938, + "rewards/reward_fn/mean": 3.9340286254882812, + "rewards/reward_fn/std": 0.3731898367404938, + "step": 3103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 159.21875, + "completions/mean_terminated_length": 159.21875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.35847095507564386, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.734375, + "kl": 0.016280231066048145, + "learning_rate": 1.7939999999999999e-06, + "loss": 0.0006, + "num_tokens": 72123820.0, + "reward": 2.9335365295410156, + "reward_std": 0.03969244658946991, + "rewards/reward_fn/mean": 2.9335365295410156, + "rewards/reward_fn/std": 0.039692435413599014, + "step": 3104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 181.0625, + "completions/mean_terminated_length": 181.0625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.3585864418524079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.007836669756215997, + "learning_rate": 1.792e-06, + "loss": 0.0003, + "num_tokens": 72151118.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 148.15625, + "completions/mean_terminated_length": 148.15625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.35870192862917194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04443359375, + "kl": 0.005970625043119071, + "learning_rate": 1.79e-06, + "loss": 0.0002, + "num_tokens": 72178067.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 76.9375, + "completions/mean_terminated_length": 76.9375, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.35881741540593604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.007895239126810338, + "learning_rate": 1.7879999999999999e-06, + "loss": 0.0003, + "num_tokens": 72190193.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1366.0, + "completions/max_terminated_length": 1366.0, + "completions/mean_length": 358.625, + "completions/mean_terminated_length": 358.625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.3589329021827001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.010514055116800591, + "learning_rate": 1.786e-06, + "loss": 0.0004, + "num_tokens": 72220933.0, + "reward": 3.6967999935150146, + "reward_std": 0.7025695443153381, + "rewards/reward_fn/mean": 3.6967999935150146, + "rewards/reward_fn/std": 0.7025694847106934, + "step": 3108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 189.09375, + "completions/mean_terminated_length": 189.09375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.3590483889594641, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.010211567736405414, + "learning_rate": 1.784e-06, + "loss": 0.0004, + "num_tokens": 72238952.0, + "reward": 3.9036340713500977, + "reward_std": 0.2592145800590515, + "rewards/reward_fn/mean": 3.9036340713500977, + "rewards/reward_fn/std": 0.2592145502567291, + "step": 3109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 274.84375, + "completions/mean_terminated_length": 274.84375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.3591638757362282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.012315795131144114, + "learning_rate": 1.782e-06, + "loss": 0.0005, + "num_tokens": 72260259.0, + "reward": 3.9340124130249023, + "reward_std": 0.3732813000679016, + "rewards/reward_fn/mean": 3.9340124130249023, + "rewards/reward_fn/std": 0.3732813000679016, + "step": 3110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 177.4375, + "completions/mean_terminated_length": 177.4375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.35927936251299225, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.0099885134404758, + "learning_rate": 1.78e-06, + "loss": 0.0004, + "num_tokens": 72286865.0, + "reward": 3.9760396480560303, + "reward_std": 0.13554035127162933, + "rewards/reward_fn/mean": 3.9760396480560303, + "rewards/reward_fn/std": 0.13554035127162933, + "step": 3111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 403.6875, + "completions/mean_terminated_length": 403.6875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.35939484928975635, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.008619454121799208, + "learning_rate": 1.778e-06, + "loss": 0.0003, + "num_tokens": 72309447.0, + "reward": 3.9295201301574707, + "reward_std": 0.3986952006816864, + "rewards/reward_fn/mean": 3.9295201301574707, + "rewards/reward_fn/std": 0.3986952006816864, + "step": 3112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 105.1875, + "completions/mean_terminated_length": 105.1875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.3595103360665204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.009975738583307248, + "learning_rate": 1.776e-06, + "loss": 0.0004, + "num_tokens": 72334573.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 99.1875, + "completions/mean_terminated_length": 99.1875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.3596258228432844, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4375, + "kl": 0.008418052639171947, + "learning_rate": 1.7739999999999999e-06, + "loss": 0.0003, + "num_tokens": 72350963.0, + "reward": 3.961111545562744, + "reward_std": 0.15311133861541748, + "rewards/reward_fn/mean": 3.961111545562744, + "rewards/reward_fn/std": 0.15311136841773987, + "step": 3114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 75.15625, + "completions/mean_terminated_length": 75.15625, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.3597413096200485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.006254315945625422, + "learning_rate": 1.772e-06, + "loss": 0.0003, + "num_tokens": 72375864.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 237.8125, + "completions/mean_terminated_length": 237.8125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.35985679639681256, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.008160210869391449, + "learning_rate": 1.77e-06, + "loss": 0.0003, + "num_tokens": 72395858.0, + "reward": 3.8873040676116943, + "reward_std": 0.3152139186859131, + "rewards/reward_fn/mean": 3.8873040676116943, + "rewards/reward_fn/std": 0.3152139484882355, + "step": 3116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 86.78125, + "completions/mean_terminated_length": 86.78125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.3599722831735766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.006557383996550925, + "learning_rate": 1.7679999999999998e-06, + "loss": 0.0003, + "num_tokens": 72414923.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 225.84375, + "completions/mean_terminated_length": 225.84375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.3600877699503407, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.009553814874379896, + "learning_rate": 1.766e-06, + "loss": 0.0004, + "num_tokens": 72441254.0, + "reward": 3.71158766746521, + "reward_std": 0.7754194140434265, + "rewards/reward_fn/mean": 3.71158766746521, + "rewards/reward_fn/std": 0.7754194140434265, + "step": 3118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 185.71875, + "completions/mean_terminated_length": 185.71875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.36020325672710474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045166015625, + "kl": 0.006956809585972223, + "learning_rate": 1.764e-06, + "loss": 0.0003, + "num_tokens": 72471933.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 84.0, + "completions/max_terminated_length": 84.0, + "completions/mean_length": 55.3125, + "completions/mean_terminated_length": 55.3125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.36031874350386883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.004527585120740696, + "learning_rate": 1.762e-06, + "loss": 0.0002, + "num_tokens": 72498119.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 99.125, + "completions/mean_terminated_length": 99.125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.3604342302806329, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051025390625, + "kl": 0.005187251383176772, + "learning_rate": 1.7599999999999999e-06, + "loss": 0.0002, + "num_tokens": 72521259.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 234.03125, + "completions/mean_terminated_length": 234.03125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.3605497170573969, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.01553030982177006, + "learning_rate": 1.758e-06, + "loss": 0.0006, + "num_tokens": 72553260.0, + "reward": 3.7886171340942383, + "reward_std": 0.6677354574203491, + "rewards/reward_fn/mean": 3.7886171340942383, + "rewards/reward_fn/std": 0.6677355170249939, + "step": 3122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 167.28125, + "completions/mean_terminated_length": 167.28125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.360665203834161, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.009766643706825562, + "learning_rate": 1.756e-06, + "loss": 0.0004, + "num_tokens": 72570293.0, + "reward": 3.9313161373138428, + "reward_std": 0.3885347843170166, + "rewards/reward_fn/mean": 3.9313161373138428, + "rewards/reward_fn/std": 0.388534814119339, + "step": 3123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 118.1875, + "completions/mean_terminated_length": 118.1875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.36078069061092505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.00792734143396956, + "learning_rate": 1.7539999999999999e-06, + "loss": 0.0003, + "num_tokens": 72586363.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 118.90625, + "completions/mean_terminated_length": 118.90625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.3608961773876891, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.010479985940037295, + "learning_rate": 1.752e-06, + "loss": 0.0004, + "num_tokens": 72605976.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 88.875, + "completions/mean_terminated_length": 88.875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.3610116641644532, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.008385728542634752, + "learning_rate": 1.75e-06, + "loss": 0.0003, + "num_tokens": 72626452.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 171.0, + "completions/mean_terminated_length": 171.0, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.3611271509412172, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.018557769784820266, + "learning_rate": 1.7479999999999998e-06, + "loss": 0.0007, + "num_tokens": 72642708.0, + "reward": 3.0981006622314453, + "reward_std": 0.037376485764980316, + "rewards/reward_fn/mean": 3.0981006622314453, + "rewards/reward_fn/std": 0.03737647831439972, + "step": 3127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 434.40625, + "completions/mean_terminated_length": 434.40625, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.3612426377179813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03369140625, + "kl": 0.007868165543186478, + "learning_rate": 1.7459999999999999e-06, + "loss": 0.0003, + "num_tokens": 72666433.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 295.5625, + "completions/mean_terminated_length": 295.5625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.36135812449474536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.00965807872125879, + "learning_rate": 1.744e-06, + "loss": 0.0004, + "num_tokens": 72687827.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 101.40625, + "completions/mean_terminated_length": 101.40625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.3614736112715094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05810546875, + "kl": 0.0056946578783936275, + "learning_rate": 1.742e-06, + "loss": 0.0002, + "num_tokens": 72704128.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 963.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 538.09375, + "completions/mean_terminated_length": 538.09375, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.3615890980482735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054931640625, + "kl": 0.00978213275084272, + "learning_rate": 1.7399999999999999e-06, + "loss": 0.0004, + "num_tokens": 72734179.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 147.875, + "completions/mean_terminated_length": 147.875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.36170458482503753, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.008619331827503629, + "learning_rate": 1.738e-06, + "loss": 0.0003, + "num_tokens": 72752447.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 69.5625, + "completions/mean_terminated_length": 69.5625, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.3618200716018016, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.005865934101166204, + "learning_rate": 1.736e-06, + "loss": 0.0002, + "num_tokens": 72766449.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 284.4375, + "completions/mean_terminated_length": 284.4375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.36193555837856567, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.0096291319350712, + "learning_rate": 1.7339999999999998e-06, + "loss": 0.0004, + "num_tokens": 72799007.0, + "reward": 3.860074996948242, + "reward_std": 0.3355824053287506, + "rewards/reward_fn/mean": 3.860074996948242, + "rewards/reward_fn/std": 0.33558234572410583, + "step": 3134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 253.71875, + "completions/mean_terminated_length": 253.71875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.3620510451553297, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.014092451179749332, + "learning_rate": 1.7319999999999999e-06, + "loss": 0.0006, + "num_tokens": 72829750.0, + "reward": 3.541663885116577, + "reward_std": 0.7520524263381958, + "rewards/reward_fn/mean": 3.541663885116577, + "rewards/reward_fn/std": 0.7520524263381958, + "step": 3135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 283.3125, + "completions/mean_terminated_length": 283.3125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.36216653193209375, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.009137464272498619, + "learning_rate": 1.73e-06, + "loss": 0.0004, + "num_tokens": 72860672.0, + "reward": 3.0388708114624023, + "reward_std": 0.25366857647895813, + "rewards/reward_fn/mean": 3.0388708114624023, + "rewards/reward_fn/std": 0.25366854667663574, + "step": 3136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 107.75, + "completions/mean_terminated_length": 107.75, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.36228201870885784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.006697552373225335, + "learning_rate": 1.7279999999999998e-06, + "loss": 0.0003, + "num_tokens": 72875544.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 400.09375, + "completions/mean_terminated_length": 400.09375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.3623975054856219, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90625, + "kl": 0.009045321552548558, + "learning_rate": 1.7259999999999999e-06, + "loss": 0.0004, + "num_tokens": 72910779.0, + "reward": 3.936422348022461, + "reward_std": 0.2501838803291321, + "rewards/reward_fn/mean": 3.936422348022461, + "rewards/reward_fn/std": 0.2501838505268097, + "step": 3138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 82.53125, + "completions/mean_terminated_length": 82.53125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.362512992262386, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.5, + "kl": 0.006240523327505798, + "learning_rate": 1.724e-06, + "loss": 0.0002, + "num_tokens": 72924236.0, + "reward": 3.855776071548462, + "reward_std": 0.3051668107509613, + "rewards/reward_fn/mean": 3.855776071548462, + "rewards/reward_fn/std": 0.3051668405532837, + "step": 3139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 177.5, + "completions/mean_terminated_length": 177.5, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.36262847903915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.011030159395886585, + "learning_rate": 1.722e-06, + "loss": 0.0004, + "num_tokens": 72942428.0, + "reward": 3.9275143146514893, + "reward_std": 0.2858148217201233, + "rewards/reward_fn/mean": 3.9275143146514893, + "rewards/reward_fn/std": 0.2858148217201233, + "step": 3140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 71.75, + "completions/mean_terminated_length": 71.75, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.36274396581591406, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.0037565069733318524, + "learning_rate": 1.7199999999999998e-06, + "loss": 0.0002, + "num_tokens": 72956948.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 57.75, + "completions/mean_terminated_length": 57.75, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.36285945259267816, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.421875, + "kl": 0.007864866216550581, + "learning_rate": 1.718e-06, + "loss": 0.0003, + "num_tokens": 72980844.0, + "reward": 3.6768460273742676, + "reward_std": 0.03315625712275505, + "rewards/reward_fn/mean": 3.6768460273742676, + "rewards/reward_fn/std": 0.03315630555152893, + "step": 3142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 247.46875, + "completions/mean_terminated_length": 247.46875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.3629749393694422, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.017707633815007284, + "learning_rate": 1.716e-06, + "loss": 0.0007, + "num_tokens": 73011611.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 145.125, + "completions/mean_terminated_length": 145.125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.36309042614620624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.011894763068994507, + "learning_rate": 1.7139999999999998e-06, + "loss": 0.0005, + "num_tokens": 73038239.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 284.53125, + "completions/mean_terminated_length": 284.53125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.36320591292297033, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.015550115131190978, + "learning_rate": 1.7119999999999999e-06, + "loss": 0.0006, + "num_tokens": 73065520.0, + "reward": 3.6229114532470703, + "reward_std": 0.5712776184082031, + "rewards/reward_fn/mean": 3.6229114532470703, + "rewards/reward_fn/std": 0.5712776184082031, + "step": 3145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 228.5625, + "completions/mean_terminated_length": 228.5625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.36332139969973437, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.013133109721820801, + "learning_rate": 1.71e-06, + "loss": 0.0005, + "num_tokens": 73092674.0, + "reward": 3.8415207862854004, + "reward_std": 0.4464723467826843, + "rewards/reward_fn/mean": 3.8415207862854004, + "rewards/reward_fn/std": 0.4464723765850067, + "step": 3146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 159.8125, + "completions/mean_terminated_length": 159.8125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.36343688647649847, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.006812120052927639, + "learning_rate": 1.7079999999999998e-06, + "loss": 0.0003, + "num_tokens": 73114780.0, + "reward": 3.0187277793884277, + "reward_std": 0.04884805157780647, + "rewards/reward_fn/mean": 3.0187277793884277, + "rewards/reward_fn/std": 0.04884805157780647, + "step": 3147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 422.53125, + "completions/mean_terminated_length": 422.53125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.3635523732532625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046630859375, + "kl": 0.009799024206586182, + "learning_rate": 1.7059999999999998e-06, + "loss": 0.0004, + "num_tokens": 73143213.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 332.0, + "completions/mean_terminated_length": 332.0, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.36366786003002655, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.01040554724750109, + "learning_rate": 1.704e-06, + "loss": 0.0004, + "num_tokens": 73165101.0, + "reward": 3.6459643840789795, + "reward_std": 0.8360132575035095, + "rewards/reward_fn/mean": 3.6459643840789795, + "rewards/reward_fn/std": 0.8360131978988647, + "step": 3149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 363.84375, + "completions/mean_terminated_length": 363.84375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.36378334680679064, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.010603518749121577, + "learning_rate": 1.702e-06, + "loss": 0.0004, + "num_tokens": 73199656.0, + "reward": 2.893897294998169, + "reward_std": 0.5565817952156067, + "rewards/reward_fn/mean": 2.893897294998169, + "rewards/reward_fn/std": 0.5565817952156067, + "step": 3150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 145.15625, + "completions/mean_terminated_length": 145.15625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.3638988335835547, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.011106204896350391, + "learning_rate": 1.6999999999999998e-06, + "loss": 0.0004, + "num_tokens": 73218541.0, + "reward": 3.9097278118133545, + "reward_std": 0.28645822405815125, + "rewards/reward_fn/mean": 3.9097278118133545, + "rewards/reward_fn/std": 0.28645819425582886, + "step": 3151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 99.34375, + "completions/mean_terminated_length": 99.34375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.3640143203603187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04638671875, + "kl": 0.0037446960177476285, + "learning_rate": 1.6979999999999999e-06, + "loss": 0.0001, + "num_tokens": 73235320.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 70.28125, + "completions/mean_terminated_length": 70.28125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.3641298071370828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.02261401922442019, + "learning_rate": 1.696e-06, + "loss": 0.0009, + "num_tokens": 73263841.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 202.1875, + "completions/mean_terminated_length": 202.1875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.36424529391384686, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.0092880845113541, + "learning_rate": 1.6939999999999998e-06, + "loss": 0.0004, + "num_tokens": 73299815.0, + "reward": 3.901219129562378, + "reward_std": 0.4171420931816101, + "rewards/reward_fn/mean": 3.901219129562378, + "rewards/reward_fn/std": 0.4171420633792877, + "step": 3154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 216.5625, + "completions/mean_terminated_length": 216.5625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.36436078069061095, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.014043276794836856, + "learning_rate": 1.6919999999999999e-06, + "loss": 0.0006, + "num_tokens": 73314809.0, + "reward": 3.9041202068328857, + "reward_std": 0.41835853457450867, + "rewards/reward_fn/mean": 3.9041202068328857, + "rewards/reward_fn/std": 0.41835853457450867, + "step": 3155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 95.46875, + "completions/mean_terminated_length": 95.46875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.364476267467375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12451171875, + "kl": 0.01310915638168808, + "learning_rate": 1.69e-06, + "loss": 0.0005, + "num_tokens": 73341864.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 97.59375, + "completions/mean_terminated_length": 97.59375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.36459175424413903, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.007212130207335576, + "learning_rate": 1.6879999999999998e-06, + "loss": 0.0003, + "num_tokens": 73358491.0, + "reward": 3.36690616607666, + "reward_std": 0.13070282340049744, + "rewards/reward_fn/mean": 3.36690616607666, + "rewards/reward_fn/std": 0.1307028830051422, + "step": 3157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 84.6875, + "completions/mean_terminated_length": 84.6875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.36470724102090313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27734375, + "kl": 0.017632295021030586, + "learning_rate": 1.6859999999999998e-06, + "loss": 0.0007, + "num_tokens": 73378289.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 200.5, + "completions/mean_terminated_length": 200.5, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.36482272779766717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053955078125, + "kl": 0.012633941769308876, + "learning_rate": 1.6839999999999999e-06, + "loss": 0.0005, + "num_tokens": 73409537.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 185.71875, + "completions/mean_terminated_length": 185.71875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.3649382145744312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.008695208474819083, + "learning_rate": 1.682e-06, + "loss": 0.0003, + "num_tokens": 73428216.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 87.90625, + "completions/mean_terminated_length": 87.90625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.3650537013511953, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.006944988315808587, + "learning_rate": 1.6799999999999998e-06, + "loss": 0.0003, + "num_tokens": 73447637.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 113.625, + "completions/mean_terminated_length": 113.625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.36516918812795934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.007049946692859521, + "learning_rate": 1.6779999999999999e-06, + "loss": 0.0003, + "num_tokens": 73463753.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 76.34375, + "completions/mean_terminated_length": 76.34375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.3652846749047234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150390625, + "kl": 0.003189392175499961, + "learning_rate": 1.676e-06, + "loss": 0.0001, + "num_tokens": 73484980.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 187.375, + "completions/mean_terminated_length": 187.375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.3654001616814875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054931640625, + "kl": 0.010517780057853088, + "learning_rate": 1.6739999999999998e-06, + "loss": 0.0004, + "num_tokens": 73511200.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 101.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 83.40625, + "completions/mean_terminated_length": 83.40625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.3655156484582515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.011056619441660587, + "learning_rate": 1.6719999999999998e-06, + "loss": 0.0004, + "num_tokens": 73524109.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 140.1875, + "completions/mean_terminated_length": 140.1875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.3656311352350156, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.009745741219376214, + "learning_rate": 1.6699999999999999e-06, + "loss": 0.0004, + "num_tokens": 73539507.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 87.21875, + "completions/mean_terminated_length": 87.21875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.36574662201177965, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.008505272980983136, + "learning_rate": 1.668e-06, + "loss": 0.0003, + "num_tokens": 73550234.0, + "reward": 3.9330055713653564, + "reward_std": 0.37897759675979614, + "rewards/reward_fn/mean": 3.9330055713653564, + "rewards/reward_fn/std": 0.37897759675979614, + "step": 3167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 280.0, + "completions/mean_terminated_length": 280.0, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.3658621087885437, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.012630769648239948, + "learning_rate": 1.6659999999999998e-06, + "loss": 0.0005, + "num_tokens": 73572122.0, + "reward": 2.931549072265625, + "reward_std": 0.24488778412342072, + "rewards/reward_fn/mean": 2.931549072265625, + "rewards/reward_fn/std": 0.24488775432109833, + "step": 3168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 178.40625, + "completions/mean_terminated_length": 178.40625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.3659775955653078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.013901818878366612, + "learning_rate": 1.6639999999999999e-06, + "loss": 0.0006, + "num_tokens": 73602695.0, + "reward": 3.966860771179199, + "reward_std": 0.1356477290391922, + "rewards/reward_fn/mean": 3.966860771179199, + "rewards/reward_fn/std": 0.1356477588415146, + "step": 3169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 174.65625, + "completions/mean_terminated_length": 174.65625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.36609308234207183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.006581671674211975, + "learning_rate": 1.662e-06, + "loss": 0.0003, + "num_tokens": 73624316.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 176.25, + "completions/mean_terminated_length": 176.25, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.36620856911883587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.007236280951474328, + "learning_rate": 1.6599999999999998e-06, + "loss": 0.0003, + "num_tokens": 73652228.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 188.78125, + "completions/mean_terminated_length": 188.78125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.36632405589559996, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.014640467677963898, + "learning_rate": 1.6579999999999998e-06, + "loss": 0.0006, + "num_tokens": 73670429.0, + "reward": 3.085757255554199, + "reward_std": 0.4129517376422882, + "rewards/reward_fn/mean": 3.085757255554199, + "rewards/reward_fn/std": 0.41295167803764343, + "step": 3172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 174.625, + "completions/mean_terminated_length": 174.625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.366439542672364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.00371534600162704, + "learning_rate": 1.656e-06, + "loss": 0.0001, + "num_tokens": 73698801.0, + "reward": 3.9324493408203125, + "reward_std": 0.38212472200393677, + "rewards/reward_fn/mean": 3.9324493408203125, + "rewards/reward_fn/std": 0.382124662399292, + "step": 3173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 200.375, + "completions/mean_terminated_length": 200.375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.3665550294491281, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.009570256988808978, + "learning_rate": 1.6539999999999997e-06, + "loss": 0.0004, + "num_tokens": 73722045.0, + "reward": 3.547865867614746, + "reward_std": 0.4911636412143707, + "rewards/reward_fn/mean": 3.547865867614746, + "rewards/reward_fn/std": 0.49116355180740356, + "step": 3174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 125.71875, + "completions/mean_terminated_length": 125.71875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.36667051622589214, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.00883681999403052, + "learning_rate": 1.6519999999999998e-06, + "loss": 0.0004, + "num_tokens": 73744820.0, + "reward": 3.931014060974121, + "reward_std": 0.3902437686920166, + "rewards/reward_fn/mean": 3.931014060974121, + "rewards/reward_fn/std": 0.3902437686920166, + "step": 3175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 98.03125, + "completions/mean_terminated_length": 98.03125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.3667860030026562, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05126953125, + "kl": 0.006429537672374863, + "learning_rate": 1.6499999999999999e-06, + "loss": 0.0003, + "num_tokens": 73760117.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 180.78125, + "completions/mean_terminated_length": 180.78125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.3669014897794203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.013338894925254863, + "learning_rate": 1.648e-06, + "loss": 0.0005, + "num_tokens": 73781326.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 126.40625, + "completions/mean_terminated_length": 126.40625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.3670169765561843, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.009390301922394428, + "learning_rate": 1.6459999999999998e-06, + "loss": 0.0004, + "num_tokens": 73797371.0, + "reward": 3.951094150543213, + "reward_std": 0.1925094723701477, + "rewards/reward_fn/mean": 3.951094150543213, + "rewards/reward_fn/std": 0.19250944256782532, + "step": 3178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 369.5625, + "completions/mean_terminated_length": 369.5625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.36713246333294836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0546875, + "kl": 0.013996974797919393, + "learning_rate": 1.6439999999999998e-06, + "loss": 0.0006, + "num_tokens": 73824333.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 276.03125, + "completions/mean_terminated_length": 276.03125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.36724795010971245, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044921875, + "kl": 0.008345533286046702, + "learning_rate": 1.642e-06, + "loss": 0.0003, + "num_tokens": 73847630.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 181.4375, + "completions/mean_terminated_length": 181.4375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.3673634368864765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05908203125, + "kl": 0.0073472558506182395, + "learning_rate": 1.6399999999999998e-06, + "loss": 0.0003, + "num_tokens": 73865340.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 145.4375, + "completions/mean_terminated_length": 145.4375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.3674789236632406, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.026772903132950887, + "learning_rate": 1.6379999999999998e-06, + "loss": 0.0011, + "num_tokens": 73886986.0, + "reward": 3.19417142868042, + "reward_std": 0.3314938247203827, + "rewards/reward_fn/mean": 3.19417142868042, + "rewards/reward_fn/std": 0.3314938247203827, + "step": 3182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 113.0625, + "completions/mean_terminated_length": 113.0625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.3675944104400046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.006126337437308393, + "learning_rate": 1.6359999999999999e-06, + "loss": 0.0002, + "num_tokens": 73915884.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 296.75, + "completions/mean_terminated_length": 296.75, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.36770989721676867, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.00991182703000959, + "learning_rate": 1.6339999999999997e-06, + "loss": 0.0004, + "num_tokens": 73944964.0, + "reward": 2.802091598510742, + "reward_std": 0.07012838125228882, + "rewards/reward_fn/mean": 2.802091598510742, + "rewards/reward_fn/std": 0.0701284408569336, + "step": 3184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 128.71875, + "completions/mean_terminated_length": 128.71875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.36782538399353276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.009079880386707373, + "learning_rate": 1.6319999999999998e-06, + "loss": 0.0004, + "num_tokens": 73961691.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 209.21875, + "completions/mean_terminated_length": 209.21875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.3679408707702968, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0400390625, + "kl": 0.006981980382988695, + "learning_rate": 1.6299999999999999e-06, + "loss": 0.0003, + "num_tokens": 73986306.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 59.5, + "completions/mean_terminated_length": 59.5, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.36805635754706084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.185546875, + "kl": 0.01331198051411775, + "learning_rate": 1.628e-06, + "loss": 0.0005, + "num_tokens": 74010610.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 147.40625, + "completions/mean_terminated_length": 147.40625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.36817184432382494, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "kl": 0.007408267898426857, + "learning_rate": 1.6259999999999998e-06, + "loss": 0.0003, + "num_tokens": 74040703.0, + "reward": 3.7672481536865234, + "reward_std": 0.4470231533050537, + "rewards/reward_fn/mean": 3.7672481536865234, + "rewards/reward_fn/std": 0.4470231235027313, + "step": 3188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 188.5, + "completions/mean_terminated_length": 188.5, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.368287331100589, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.01669287415279541, + "learning_rate": 1.624e-06, + "loss": 0.0007, + "num_tokens": 74059631.0, + "reward": 3.9766135215759277, + "reward_std": 0.13229383528232574, + "rewards/reward_fn/mean": 3.9766135215759277, + "rewards/reward_fn/std": 0.13229382038116455, + "step": 3189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 203.40625, + "completions/mean_terminated_length": 203.40625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.368402817877353, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.01023837622415158, + "learning_rate": 1.622e-06, + "loss": 0.0004, + "num_tokens": 74088988.0, + "reward": 3.587146282196045, + "reward_std": 0.8732724785804749, + "rewards/reward_fn/mean": 3.587146282196045, + "rewards/reward_fn/std": 0.8732723593711853, + "step": 3190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 259.65625, + "completions/mean_terminated_length": 259.65625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.3685183046541171, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.007789491646690294, + "learning_rate": 1.62e-06, + "loss": 0.0003, + "num_tokens": 74110065.0, + "reward": 3.978320598602295, + "reward_std": 0.1226368248462677, + "rewards/reward_fn/mean": 3.978320598602295, + "rewards/reward_fn/std": 0.12263678759336472, + "step": 3191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 284.21875, + "completions/mean_terminated_length": 284.21875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.36863379143088115, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.01127834884391632, + "learning_rate": 1.618e-06, + "loss": 0.0005, + "num_tokens": 74141208.0, + "reward": 3.717515468597412, + "reward_std": 0.43806686997413635, + "rewards/reward_fn/mean": 3.717515468597412, + "rewards/reward_fn/std": 0.4380668103694916, + "step": 3192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1253.0, + "completions/max_terminated_length": 1253.0, + "completions/mean_length": 410.5625, + "completions/mean_terminated_length": 410.5625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.36874927820764525, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.012326649943133816, + "learning_rate": 1.616e-06, + "loss": 0.0005, + "num_tokens": 74162890.0, + "reward": 3.186025619506836, + "reward_std": 0.32317402958869934, + "rewards/reward_fn/mean": 3.186025619506836, + "rewards/reward_fn/std": 0.32317399978637695, + "step": 3193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 101.28125, + "completions/mean_terminated_length": 101.28125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.3688647649844093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.007119164067262318, + "learning_rate": 1.6140000000000001e-06, + "loss": 0.0003, + "num_tokens": 74191315.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 80.5, + "completions/mean_terminated_length": 80.5, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.36898025176117333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.00735476737463614, + "learning_rate": 1.612e-06, + "loss": 0.0003, + "num_tokens": 74212483.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 59.125, + "completions/mean_terminated_length": 59.125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.3690957385379374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.006882053523440845, + "learning_rate": 1.61e-06, + "loss": 0.0003, + "num_tokens": 74226215.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 321.9375, + "completions/mean_terminated_length": 321.9375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.36921122531470146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.010929904172371607, + "learning_rate": 1.608e-06, + "loss": 0.0004, + "num_tokens": 74252837.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 149.46875, + "completions/mean_terminated_length": 149.46875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.3693267120914655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045166015625, + "kl": 0.005881161872821394, + "learning_rate": 1.606e-06, + "loss": 0.0002, + "num_tokens": 74277780.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 91.71875, + "completions/mean_terminated_length": 91.71875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.3694421988682296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.008670190061820904, + "learning_rate": 1.604e-06, + "loss": 0.0003, + "num_tokens": 74303755.0, + "reward": 3.929748296737671, + "reward_std": 0.27648335695266724, + "rewards/reward_fn/mean": 3.929748296737671, + "rewards/reward_fn/std": 0.2764833867549896, + "step": 3199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 204.90625, + "completions/mean_terminated_length": 204.90625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.36955768564499364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.009071725500689354, + "learning_rate": 1.602e-06, + "loss": 0.0004, + "num_tokens": 74328520.0, + "reward": 3.909743309020996, + "reward_std": 0.28532108664512634, + "rewards/reward_fn/mean": 3.909743309020996, + "rewards/reward_fn/std": 0.28532111644744873, + "step": 3200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 108.5625, + "completions/mean_terminated_length": 108.5625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.36967317242175773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.007224757660878822, + "learning_rate": 1.6e-06, + "loss": 0.0003, + "num_tokens": 74348954.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 72.5, + "completions/mean_terminated_length": 72.5, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.3697886591985218, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042236328125, + "kl": 0.0034754131447698455, + "learning_rate": 1.598e-06, + "loss": 0.0001, + "num_tokens": 74368266.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 183.9375, + "completions/mean_terminated_length": 183.9375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.3699041459752858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.014391868186066858, + "learning_rate": 1.596e-06, + "loss": 0.0006, + "num_tokens": 74388456.0, + "reward": 3.969475746154785, + "reward_std": 0.17267143726348877, + "rewards/reward_fn/mean": 3.969475746154785, + "rewards/reward_fn/std": 0.17267146706581116, + "step": 3203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.3700196327520499, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04443359375, + "kl": 0.010860609196242876, + "learning_rate": 1.5940000000000001e-06, + "loss": 0.0004, + "num_tokens": 74413541.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 294.0625, + "completions/mean_terminated_length": 294.0625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.37013511952881395, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.009571213944582269, + "learning_rate": 1.592e-06, + "loss": 0.0004, + "num_tokens": 74439111.0, + "reward": 3.931560516357422, + "reward_std": 0.3871529996395111, + "rewards/reward_fn/mean": 3.931560516357422, + "rewards/reward_fn/std": 0.3871529698371887, + "step": 3205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 104.9375, + "completions/mean_terminated_length": 104.9375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.370250606305578, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.010939283754851203, + "learning_rate": 1.59e-06, + "loss": 0.0004, + "num_tokens": 74466309.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 100.59375, + "completions/mean_terminated_length": 100.59375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.3703660930823421, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.00665222905081464, + "learning_rate": 1.588e-06, + "loss": 0.0003, + "num_tokens": 74489208.0, + "reward": 3.494692802429199, + "reward_std": 0.06691040098667145, + "rewards/reward_fn/mean": 3.494692802429199, + "rewards/reward_fn/std": 0.06691040098667145, + "step": 3207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 292.0, + "completions/mean_terminated_length": 292.0, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.3704815798591061, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052001953125, + "kl": 0.009489545482210815, + "learning_rate": 1.586e-06, + "loss": 0.0004, + "num_tokens": 74511768.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 110.65625, + "completions/mean_terminated_length": 110.65625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.3705970666358702, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0439453125, + "kl": 0.0038332797066686908, + "learning_rate": 1.584e-06, + "loss": 0.0002, + "num_tokens": 74536717.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 509.34375, + "completions/mean_terminated_length": 509.34375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.37071255341263426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.010702057392336428, + "learning_rate": 1.582e-06, + "loss": 0.0004, + "num_tokens": 74568408.0, + "reward": 2.5844883918762207, + "reward_std": 0.24132712185382843, + "rewards/reward_fn/mean": 2.5844883918762207, + "rewards/reward_fn/std": 0.24132713675498962, + "step": 3210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 82.21875, + "completions/mean_terminated_length": 82.21875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.3708280401893983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17578125, + "kl": 0.01224532643573184, + "learning_rate": 1.58e-06, + "loss": 0.0005, + "num_tokens": 74586847.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 377.3125, + "completions/mean_terminated_length": 377.3125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.3709435269661624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.011704895907314494, + "learning_rate": 1.578e-06, + "loss": 0.0005, + "num_tokens": 74611849.0, + "reward": 3.6397252082824707, + "reward_std": 0.800005316734314, + "rewards/reward_fn/mean": 3.6397252082824707, + "rewards/reward_fn/std": 0.8000052571296692, + "step": 3212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 257.21875, + "completions/mean_terminated_length": 257.21875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.37105901374292644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.009298241202486679, + "learning_rate": 1.576e-06, + "loss": 0.0004, + "num_tokens": 74632656.0, + "reward": 3.021463394165039, + "reward_std": 0.5267957448959351, + "rewards/reward_fn/mean": 3.021463394165039, + "rewards/reward_fn/std": 0.5267957448959351, + "step": 3213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 80.28125, + "completions/mean_terminated_length": 80.28125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.3711745005196905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.007029809828964062, + "learning_rate": 1.574e-06, + "loss": 0.0003, + "num_tokens": 74664345.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 200.15625, + "completions/mean_terminated_length": 200.15625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.37128998729645457, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.011290883150650188, + "learning_rate": 1.572e-06, + "loss": 0.0005, + "num_tokens": 74692926.0, + "reward": 3.090942144393921, + "reward_std": 0.06344287842512131, + "rewards/reward_fn/mean": 3.090942144393921, + "rewards/reward_fn/std": 0.06344287097454071, + "step": 3215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 97.03125, + "completions/mean_terminated_length": 97.03125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.3714054740732186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1484375, + "kl": 0.0180619263992412, + "learning_rate": 1.57e-06, + "loss": 0.0007, + "num_tokens": 74717087.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 254.25, + "completions/mean_terminated_length": 254.25, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.37152096084998265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0341796875, + "kl": 0.00591256417465047, + "learning_rate": 1.568e-06, + "loss": 0.0002, + "num_tokens": 74737959.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 74.0, + "completions/max_terminated_length": 74.0, + "completions/mean_length": 57.71875, + "completions/mean_terminated_length": 57.71875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.37163644762674675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19921875, + "kl": 0.013560307095758617, + "learning_rate": 1.566e-06, + "loss": 0.0005, + "num_tokens": 74753054.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 153.0625, + "completions/mean_terminated_length": 153.0625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.3717519344035108, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0576171875, + "kl": 0.008659151470055804, + "learning_rate": 1.564e-06, + "loss": 0.0003, + "num_tokens": 74770016.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 88.71875, + "completions/mean_terminated_length": 88.71875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.3718674211802749, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.013545121313654818, + "learning_rate": 1.562e-06, + "loss": 0.0005, + "num_tokens": 74787735.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 584.125, + "completions/mean_terminated_length": 536.9031982421875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.3719829079570389, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9765625, + "kl": 0.011024560648365878, + "learning_rate": 1.5599999999999999e-06, + "loss": 0.0004, + "num_tokens": 74828539.0, + "reward": 2.586242437362671, + "reward_std": 0.6071089506149292, + "rewards/reward_fn/mean": 2.586242437362671, + "rewards/reward_fn/std": 0.6071089506149292, + "step": 3221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 150.0625, + "completions/mean_terminated_length": 150.0625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.37209839473380296, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.008182945442968048, + "learning_rate": 1.558e-06, + "loss": 0.0003, + "num_tokens": 74844157.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 66.40625, + "completions/mean_terminated_length": 66.40625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.37221388151056706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.006069362420021207, + "learning_rate": 1.556e-06, + "loss": 0.0002, + "num_tokens": 74869290.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 155.375, + "completions/mean_terminated_length": 155.375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.3723293682873311, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.00819100630178582, + "learning_rate": 1.554e-06, + "loss": 0.0003, + "num_tokens": 74896374.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 249.21875, + "completions/mean_terminated_length": 249.21875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.37244485506409514, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.007829940259398427, + "learning_rate": 1.552e-06, + "loss": 0.0003, + "num_tokens": 74919485.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 54.4375, + "completions/mean_terminated_length": 54.4375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.37256034184085923, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.6875, + "kl": 0.010902377533057006, + "learning_rate": 1.55e-06, + "loss": 0.0004, + "num_tokens": 74933867.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 113.875, + "completions/mean_terminated_length": 113.875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.3726758286176233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053955078125, + "kl": 0.004809475014553755, + "learning_rate": 1.548e-06, + "loss": 0.0002, + "num_tokens": 74959207.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 91.0, + "completions/max_terminated_length": 91.0, + "completions/mean_length": 56.25, + "completions/mean_terminated_length": 56.25, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.37279131539438737, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.125, + "kl": 0.028501552718807943, + "learning_rate": 1.5459999999999999e-06, + "loss": 0.0011, + "num_tokens": 74979119.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 229.65625, + "completions/mean_terminated_length": 229.65625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.3729068021711514, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.006486557795142289, + "learning_rate": 1.544e-06, + "loss": 0.0003, + "num_tokens": 75009668.0, + "reward": 3.207226037979126, + "reward_std": 0.466844379901886, + "rewards/reward_fn/mean": 3.207226037979126, + "rewards/reward_fn/std": 0.4668444097042084, + "step": 3229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 71.375, + "completions/mean_terminated_length": 71.375, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.37302228894791545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.006516888508485863, + "learning_rate": 1.542e-06, + "loss": 0.0003, + "num_tokens": 75032720.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 92.375, + "completions/mean_terminated_length": 92.375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.37313777572467954, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "kl": 0.011776424085837789, + "learning_rate": 1.5399999999999999e-06, + "loss": 0.0005, + "num_tokens": 75054364.0, + "reward": 3.0907275676727295, + "reward_std": 0.24124805629253387, + "rewards/reward_fn/mean": 3.0907275676727295, + "rewards/reward_fn/std": 0.24124804139137268, + "step": 3231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 184.5625, + "completions/mean_terminated_length": 184.5625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.3732532625014436, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.010387043032096699, + "learning_rate": 1.538e-06, + "loss": 0.0004, + "num_tokens": 75084590.0, + "reward": 3.969956159591675, + "reward_std": 0.16995397210121155, + "rewards/reward_fn/mean": 3.969956159591675, + "rewards/reward_fn/std": 0.16995400190353394, + "step": 3232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 254.71875, + "completions/mean_terminated_length": 254.71875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.3733687492782076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.011178489527083002, + "learning_rate": 1.536e-06, + "loss": 0.0004, + "num_tokens": 75116645.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 218.96875, + "completions/mean_terminated_length": 218.96875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.3734842360549717, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.009181443987472448, + "learning_rate": 1.534e-06, + "loss": 0.0004, + "num_tokens": 75138852.0, + "reward": 3.9302749633789062, + "reward_std": 0.3944237530231476, + "rewards/reward_fn/mean": 3.9302749633789062, + "rewards/reward_fn/std": 0.39442378282546997, + "step": 3234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 223.40625, + "completions/mean_terminated_length": 223.40625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.37359972283173576, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.01221833388262894, + "learning_rate": 1.532e-06, + "loss": 0.0005, + "num_tokens": 75170545.0, + "reward": 2.936990737915039, + "reward_std": 0.05098772794008255, + "rewards/reward_fn/mean": 2.936990737915039, + "rewards/reward_fn/std": 0.05098772794008255, + "step": 3235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 232.875, + "completions/mean_terminated_length": 232.875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.37371520960849985, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.009862378043180797, + "learning_rate": 1.53e-06, + "loss": 0.0004, + "num_tokens": 75197389.0, + "reward": 3.9279494285583496, + "reward_std": 0.40757888555526733, + "rewards/reward_fn/mean": 3.9279494285583496, + "rewards/reward_fn/std": 0.4075789153575897, + "step": 3236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 147.78125, + "completions/mean_terminated_length": 147.78125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.3738306963852639, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.59375, + "kl": 0.007494337303796783, + "learning_rate": 1.528e-06, + "loss": 0.0003, + "num_tokens": 75221094.0, + "reward": 3.9533748626708984, + "reward_std": 0.18346816301345825, + "rewards/reward_fn/mean": 3.9533748626708984, + "rewards/reward_fn/std": 0.18346813321113586, + "step": 3237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 315.84375, + "completions/mean_terminated_length": 315.84375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.37394618316202793, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.012794995156582445, + "learning_rate": 1.5259999999999999e-06, + "loss": 0.0005, + "num_tokens": 75255937.0, + "reward": 3.048921823501587, + "reward_std": 1.0958207845687866, + "rewards/reward_fn/mean": 3.048921823501587, + "rewards/reward_fn/std": 1.095820665359497, + "step": 3238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 177.5, + "completions/mean_terminated_length": 177.5, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.37406166993879203, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.02792708385095466, + "learning_rate": 1.524e-06, + "loss": 0.0011, + "num_tokens": 75284625.0, + "reward": 3.0292797088623047, + "reward_std": 0.1303415596485138, + "rewards/reward_fn/mean": 3.0292797088623047, + "rewards/reward_fn/std": 0.13034158945083618, + "step": 3239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 188.125, + "completions/mean_terminated_length": 188.125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.37417715671555607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.013345544764888473, + "learning_rate": 1.522e-06, + "loss": 0.0005, + "num_tokens": 75302933.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 76.6875, + "completions/mean_terminated_length": 76.6875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.3742926434923201, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053955078125, + "kl": 0.00426785016861686, + "learning_rate": 1.5199999999999998e-06, + "loss": 0.0002, + "num_tokens": 75326443.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 240.90625, + "completions/mean_terminated_length": 240.90625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.3744081302690842, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.008135039308399428, + "learning_rate": 1.518e-06, + "loss": 0.0003, + "num_tokens": 75349512.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 143.28125, + "completions/mean_terminated_length": 143.28125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.37452361704584824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1787109375, + "kl": 0.01221661051386036, + "learning_rate": 1.516e-06, + "loss": 0.0005, + "num_tokens": 75371697.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 152.15625, + "completions/mean_terminated_length": 152.15625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.3746391038226123, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "kl": 0.009547617883072235, + "learning_rate": 1.514e-06, + "loss": 0.0004, + "num_tokens": 75407030.0, + "reward": 3.932006359100342, + "reward_std": 0.3846307098865509, + "rewards/reward_fn/mean": 3.932006359100342, + "rewards/reward_fn/std": 0.3846306800842285, + "step": 3244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 95.28125, + "completions/mean_terminated_length": 95.28125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.3747545905993764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.002960454028652748, + "learning_rate": 1.5119999999999999e-06, + "loss": 0.0001, + "num_tokens": 75434591.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 393.125, + "completions/mean_terminated_length": 339.7419128417969, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.3748700773761404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.59765625, + "kl": 0.008494781846820842, + "learning_rate": 1.51e-06, + "loss": 0.0003, + "num_tokens": 75462467.0, + "reward": 3.7997541427612305, + "reward_std": 0.8134886622428894, + "rewards/reward_fn/mean": 3.7997541427612305, + "rewards/reward_fn/std": 0.8134886026382446, + "step": 3246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 291.9375, + "completions/mean_terminated_length": 291.9375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.3749855641529045, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.008332013014296535, + "learning_rate": 1.508e-06, + "loss": 0.0003, + "num_tokens": 75489281.0, + "reward": 3.8337323665618896, + "reward_std": 0.40002715587615967, + "rewards/reward_fn/mean": 3.8337323665618896, + "rewards/reward_fn/std": 0.40002715587615967, + "step": 3247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 204.28125, + "completions/mean_terminated_length": 204.28125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.37510105092966856, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.017047958535840735, + "learning_rate": 1.5059999999999999e-06, + "loss": 0.0007, + "num_tokens": 75518026.0, + "reward": 3.2558507919311523, + "reward_std": 0.47158193588256836, + "rewards/reward_fn/mean": 3.2558507919311523, + "rewards/reward_fn/std": 0.47158190608024597, + "step": 3248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 66.5, + "completions/mean_terminated_length": 66.5, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.3752165377064326, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.00570001153027988, + "learning_rate": 1.504e-06, + "loss": 0.0002, + "num_tokens": 75541306.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 324.4375, + "completions/mean_terminated_length": 324.4375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.3753320244831967, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.009190464319544844, + "learning_rate": 1.502e-06, + "loss": 0.0004, + "num_tokens": 75571560.0, + "reward": 3.818847179412842, + "reward_std": 0.4876691401004791, + "rewards/reward_fn/mean": 3.818847179412842, + "rewards/reward_fn/std": 0.4876691401004791, + "step": 3250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 249.0625, + "completions/mean_terminated_length": 249.0625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.37544751125996073, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.011274489443167113, + "learning_rate": 1.5e-06, + "loss": 0.0005, + "num_tokens": 75601706.0, + "reward": 2.8138680458068848, + "reward_std": 0.41351720690727234, + "rewards/reward_fn/mean": 2.8138680458068848, + "rewards/reward_fn/std": 0.41351717710494995, + "step": 3251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 442.0625, + "completions/mean_terminated_length": 442.0625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.37556299803672477, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.01115893638052512, + "learning_rate": 1.4979999999999999e-06, + "loss": 0.0004, + "num_tokens": 75627884.0, + "reward": 3.3014769554138184, + "reward_std": 1.0526853799819946, + "rewards/reward_fn/mean": 3.3014769554138184, + "rewards/reward_fn/std": 1.0526853799819946, + "step": 3252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 181.1875, + "completions/mean_terminated_length": 181.1875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.37567848481348887, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.007170024480728898, + "learning_rate": 1.496e-06, + "loss": 0.0003, + "num_tokens": 75654738.0, + "reward": 3.94319224357605, + "reward_std": 0.22389854490756989, + "rewards/reward_fn/mean": 3.94319224357605, + "rewards/reward_fn/std": 0.2238985151052475, + "step": 3253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 127.15625, + "completions/mean_terminated_length": 127.15625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.3757939715902529, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.013871297749574296, + "learning_rate": 1.494e-06, + "loss": 0.0006, + "num_tokens": 75676983.0, + "reward": 3.934171676635742, + "reward_std": 0.2592127025127411, + "rewards/reward_fn/mean": 3.934171676635742, + "rewards/reward_fn/std": 0.2592127323150635, + "step": 3254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 954.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 420.875, + "completions/mean_terminated_length": 420.875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.375909458367017, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.013805220776703209, + "learning_rate": 1.4919999999999999e-06, + "loss": 0.0006, + "num_tokens": 75702675.0, + "reward": 3.7153358459472656, + "reward_std": 0.7652637362480164, + "rewards/reward_fn/mean": 3.7153358459472656, + "rewards/reward_fn/std": 0.7652637362480164, + "step": 3255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 340.3125, + "completions/mean_terminated_length": 340.3125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.37602494514378104, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.01047501529683359, + "learning_rate": 1.49e-06, + "loss": 0.0004, + "num_tokens": 75736605.0, + "reward": 3.832148790359497, + "reward_std": 0.5668955445289612, + "rewards/reward_fn/mean": 3.832148790359497, + "rewards/reward_fn/std": 0.5668955445289612, + "step": 3256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 87.5, + "completions/mean_terminated_length": 87.5, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.3761404319205451, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.46875, + "kl": 0.008669969145557843, + "learning_rate": 1.488e-06, + "loss": 0.0003, + "num_tokens": 75757901.0, + "reward": 3.668288230895996, + "reward_std": 0.7831881046295166, + "rewards/reward_fn/mean": 3.668288230895996, + "rewards/reward_fn/std": 0.7831880450248718, + "step": 3257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 256.53125, + "completions/mean_terminated_length": 256.53125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.3762559186973092, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038818359375, + "kl": 0.007256615805090405, + "learning_rate": 1.4859999999999998e-06, + "loss": 0.0003, + "num_tokens": 75780862.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 209.46875, + "completions/mean_terminated_length": 209.46875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.3763714054740732, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.017362103651976213, + "learning_rate": 1.484e-06, + "loss": 0.0007, + "num_tokens": 75807373.0, + "reward": 2.945754289627075, + "reward_std": 0.7100226283073425, + "rewards/reward_fn/mean": 2.945754289627075, + "rewards/reward_fn/std": 0.7100225687026978, + "step": 3259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 59.03125, + "completions/mean_terminated_length": 59.03125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.37648689225083726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.009134952037129551, + "learning_rate": 1.482e-06, + "loss": 0.0004, + "num_tokens": 75831822.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 89.09375, + "completions/mean_terminated_length": 89.09375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.37660237902760135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05029296875, + "kl": 0.0036516136569844093, + "learning_rate": 1.48e-06, + "loss": 0.0001, + "num_tokens": 75854737.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1327.0, + "completions/max_terminated_length": 1327.0, + "completions/mean_length": 432.34375, + "completions/mean_terminated_length": 432.34375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.3767178658043654, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.012271765357581899, + "learning_rate": 1.4779999999999999e-06, + "loss": 0.0005, + "num_tokens": 75882268.0, + "reward": 3.6494200229644775, + "reward_std": 0.827787458896637, + "rewards/reward_fn/mean": 3.6494200229644775, + "rewards/reward_fn/std": 0.827787458896637, + "step": 3262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 66.8125, + "completions/mean_terminated_length": 66.8125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.3768333525811295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1748046875, + "kl": 0.014245969148760196, + "learning_rate": 1.476e-06, + "loss": 0.0006, + "num_tokens": 75906742.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 144.46875, + "completions/mean_terminated_length": 144.46875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.37694883935789353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.0112898102379404, + "learning_rate": 1.474e-06, + "loss": 0.0005, + "num_tokens": 75934725.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 185.71875, + "completions/mean_terminated_length": 185.71875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.37706432613465757, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.008814761335088406, + "learning_rate": 1.4719999999999998e-06, + "loss": 0.0004, + "num_tokens": 75965276.0, + "reward": 3.9685158729553223, + "reward_std": 0.17810073494911194, + "rewards/reward_fn/mean": 3.9685158729553223, + "rewards/reward_fn/std": 0.17810069024562836, + "step": 3265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 192.03125, + "completions/mean_terminated_length": 192.03125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.37717981291142166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.008331175537023228, + "learning_rate": 1.47e-06, + "loss": 0.0003, + "num_tokens": 75997661.0, + "reward": 3.2887744903564453, + "reward_std": 0.37087005376815796, + "rewards/reward_fn/mean": 3.2887744903564453, + "rewards/reward_fn/std": 0.37087005376815796, + "step": 3266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 56.78125, + "completions/mean_terminated_length": 56.78125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.3772952996881857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.007465236096322769, + "learning_rate": 1.468e-06, + "loss": 0.0003, + "num_tokens": 76016566.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 168.46875, + "completions/mean_terminated_length": 168.46875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.37741078646494974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12451171875, + "kl": 0.015982108161551878, + "learning_rate": 1.4659999999999998e-06, + "loss": 0.0006, + "num_tokens": 76038949.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 135.75, + "completions/mean_terminated_length": 135.75, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.37752627324171384, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.00922201709181536, + "learning_rate": 1.4639999999999999e-06, + "loss": 0.0004, + "num_tokens": 76068317.0, + "reward": 3.7049050331115723, + "reward_std": 0.454719603061676, + "rewards/reward_fn/mean": 3.7049050331115723, + "rewards/reward_fn/std": 0.45471954345703125, + "step": 3269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 416.75, + "completions/mean_terminated_length": 364.1290283203125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.3776417600184779, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.011884218722116202, + "learning_rate": 1.462e-06, + "loss": 0.0005, + "num_tokens": 76090101.0, + "reward": 3.0273752212524414, + "reward_std": 0.7239521145820618, + "rewards/reward_fn/mean": 3.0273752212524414, + "rewards/reward_fn/std": 0.7239521145820618, + "step": 3270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 310.96875, + "completions/mean_terminated_length": 254.9354705810547, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.3777572467952419, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.73828125, + "kl": 0.009107674290135037, + "learning_rate": 1.46e-06, + "loss": 0.0004, + "num_tokens": 76115764.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 410.125, + "completions/mean_terminated_length": 410.125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.377872733572006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.85546875, + "kl": 0.007808927628502715, + "learning_rate": 1.4579999999999998e-06, + "loss": 0.0003, + "num_tokens": 76138712.0, + "reward": 3.927762508392334, + "reward_std": 0.4086366891860962, + "rewards/reward_fn/mean": 3.927762508392334, + "rewards/reward_fn/std": 0.4086366891860962, + "step": 3272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 237.09375, + "completions/mean_terminated_length": 237.09375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.37798822034877005, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.010625309369061142, + "learning_rate": 1.456e-06, + "loss": 0.0004, + "num_tokens": 76169819.0, + "reward": 3.0964367389678955, + "reward_std": 0.07283965498209, + "rewards/reward_fn/mean": 3.0964367389678955, + "rewards/reward_fn/std": 0.07283968478441238, + "step": 3273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 242.625, + "completions/mean_terminated_length": 242.625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.37810370712553415, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.011145018273964524, + "learning_rate": 1.454e-06, + "loss": 0.0004, + "num_tokens": 76190255.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 191.03125, + "completions/mean_terminated_length": 191.03125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.3782191939022982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.008353696524864063, + "learning_rate": 1.4519999999999998e-06, + "loss": 0.0003, + "num_tokens": 76208240.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 157.6875, + "completions/mean_terminated_length": 157.6875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.37833468067906223, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.013920499666710384, + "learning_rate": 1.4499999999999999e-06, + "loss": 0.0006, + "num_tokens": 76236134.0, + "reward": 3.1631648540496826, + "reward_std": 0.16479817032814026, + "rewards/reward_fn/mean": 3.1631648540496826, + "rewards/reward_fn/std": 0.16479818522930145, + "step": 3276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 244.875, + "completions/mean_terminated_length": 244.875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.3784501674558263, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.010995337652275339, + "learning_rate": 1.448e-06, + "loss": 0.0004, + "num_tokens": 76254530.0, + "reward": 3.929506540298462, + "reward_std": 0.3987712264060974, + "rewards/reward_fn/mean": 3.929506540298462, + "rewards/reward_fn/std": 0.3987712562084198, + "step": 3277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 166.875, + "completions/mean_terminated_length": 166.875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.37856565423259037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.011836282625154126, + "learning_rate": 1.4459999999999998e-06, + "loss": 0.0005, + "num_tokens": 76282398.0, + "reward": 3.962951183319092, + "reward_std": 0.2095796912908554, + "rewards/reward_fn/mean": 3.962951183319092, + "rewards/reward_fn/std": 0.2095796763896942, + "step": 3278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 250.6875, + "completions/mean_terminated_length": 250.6875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.3786811410093544, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051025390625, + "kl": 0.010460957171744667, + "learning_rate": 1.4439999999999999e-06, + "loss": 0.0004, + "num_tokens": 76305684.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 249.90625, + "completions/mean_terminated_length": 249.90625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.3787966277861185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.010084359731990844, + "learning_rate": 1.442e-06, + "loss": 0.0004, + "num_tokens": 76333233.0, + "reward": 3.916532516479492, + "reward_std": 0.2693309485912323, + "rewards/reward_fn/mean": 3.916532516479492, + "rewards/reward_fn/std": 0.2693309485912323, + "step": 3280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.37891211456288254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.008181240707926918, + "learning_rate": 1.44e-06, + "loss": 0.0003, + "num_tokens": 76366061.0, + "reward": 3.2107038497924805, + "reward_std": 0.9416924715042114, + "rewards/reward_fn/mean": 3.2107038497924805, + "rewards/reward_fn/std": 0.9416924715042114, + "step": 3281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 95.28125, + "completions/mean_terminated_length": 95.28125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.37902760133964664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.00935044225116144, + "learning_rate": 1.4379999999999998e-06, + "loss": 0.0004, + "num_tokens": 76382966.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 234.25, + "completions/mean_terminated_length": 234.25, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.3791430881164107, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.011516588667291217, + "learning_rate": 1.4359999999999999e-06, + "loss": 0.0005, + "num_tokens": 76402782.0, + "reward": 3.9295637607574463, + "reward_std": 0.3984476625919342, + "rewards/reward_fn/mean": 3.9295637607574463, + "rewards/reward_fn/std": 0.3984476625919342, + "step": 3283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.3792585748931747, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.01183983932423871, + "learning_rate": 1.434e-06, + "loss": 0.0005, + "num_tokens": 76431002.0, + "reward": 3.115452289581299, + "reward_std": 0.10094963759183884, + "rewards/reward_fn/mean": 3.115452289581299, + "rewards/reward_fn/std": 0.10094966739416122, + "step": 3284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 207.96875, + "completions/mean_terminated_length": 207.96875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.3793740616699388, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.009011607078718953, + "learning_rate": 1.4319999999999998e-06, + "loss": 0.0004, + "num_tokens": 76463769.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 197.4375, + "completions/mean_terminated_length": 197.4375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.37948954844670285, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.01097677109646611, + "learning_rate": 1.4299999999999999e-06, + "loss": 0.0004, + "num_tokens": 76492807.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 156.84375, + "completions/mean_terminated_length": 156.84375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.3796050352234669, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.024214612087234855, + "learning_rate": 1.428e-06, + "loss": 0.001, + "num_tokens": 76512642.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 193.53125, + "completions/mean_terminated_length": 193.53125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.379720522000231, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.007703399409365375, + "learning_rate": 1.4259999999999998e-06, + "loss": 0.0003, + "num_tokens": 76533523.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 75.84375, + "completions/mean_terminated_length": 75.84375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.379836008776995, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.005398292749305256, + "learning_rate": 1.4239999999999998e-06, + "loss": 0.0002, + "num_tokens": 76547214.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 353.15625, + "completions/mean_terminated_length": 353.15625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.3799514955537591, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.014203392463969067, + "learning_rate": 1.422e-06, + "loss": 0.0006, + "num_tokens": 76579955.0, + "reward": 3.0590720176696777, + "reward_std": 0.8627829551696777, + "rewards/reward_fn/mean": 3.0590720176696777, + "rewards/reward_fn/std": 0.8627830147743225, + "step": 3290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 200.75, + "completions/mean_terminated_length": 200.75, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.38006698233052316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.013199646316934377, + "learning_rate": 1.42e-06, + "loss": 0.0005, + "num_tokens": 76596395.0, + "reward": 2.7890520095825195, + "reward_std": 0.050225816667079926, + "rewards/reward_fn/mean": 2.7890520095825195, + "rewards/reward_fn/std": 0.05022577568888664, + "step": 3291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 103.53125, + "completions/mean_terminated_length": 103.53125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.3801824691072872, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.007625926329637878, + "learning_rate": 1.4179999999999998e-06, + "loss": 0.0003, + "num_tokens": 76626268.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 105.03125, + "completions/mean_terminated_length": 105.03125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.3802979558840513, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.09375, + "kl": 0.007866023770475294, + "learning_rate": 1.4159999999999999e-06, + "loss": 0.0003, + "num_tokens": 76653085.0, + "reward": 2.9906857013702393, + "reward_std": 0.03695880249142647, + "rewards/reward_fn/mean": 2.9906857013702393, + "rewards/reward_fn/std": 0.036958761513233185, + "step": 3293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 195.46875, + "completions/mean_terminated_length": 195.46875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.38041344266081534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045654296875, + "kl": 0.006634403325733729, + "learning_rate": 1.414e-06, + "loss": 0.0003, + "num_tokens": 76671724.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 340.6875, + "completions/mean_terminated_length": 340.6875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.3805289294375794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031982421875, + "kl": 0.006841682952654082, + "learning_rate": 1.4119999999999998e-06, + "loss": 0.0003, + "num_tokens": 76692418.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 170.90625, + "completions/mean_terminated_length": 170.90625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.3806444162143435, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.009822107225772925, + "learning_rate": 1.4099999999999998e-06, + "loss": 0.0004, + "num_tokens": 76709151.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 137.15625, + "completions/mean_terminated_length": 137.15625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.3807599029911075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.00714944733408629, + "learning_rate": 1.408e-06, + "loss": 0.0003, + "num_tokens": 76727140.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 76.40625, + "completions/mean_terminated_length": 76.40625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.38087538976787155, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.28125, + "kl": 0.00909142709315347, + "learning_rate": 1.4059999999999998e-06, + "loss": 0.0004, + "num_tokens": 76749937.0, + "reward": 2.902263641357422, + "reward_std": 0.21682624518871307, + "rewards/reward_fn/mean": 2.902263641357422, + "rewards/reward_fn/std": 0.21682624518871307, + "step": 3298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 216.59375, + "completions/mean_terminated_length": 216.59375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.38099087654463565, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.013732968174736015, + "learning_rate": 1.4039999999999998e-06, + "loss": 0.0005, + "num_tokens": 76781764.0, + "reward": 3.9622573852539062, + "reward_std": 0.21350519359111786, + "rewards/reward_fn/mean": 3.9622573852539062, + "rewards/reward_fn/std": 0.21350522339344025, + "step": 3299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 161.46875, + "completions/mean_terminated_length": 161.46875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.3811063633213997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.010560610578977503, + "learning_rate": 1.4019999999999999e-06, + "loss": 0.0004, + "num_tokens": 76802707.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 153.6875, + "completions/mean_terminated_length": 153.6875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.3812218500981638, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.008024093076528516, + "learning_rate": 1.4e-06, + "loss": 0.0003, + "num_tokens": 76818729.0, + "reward": 3.9643306732177734, + "reward_std": 0.2017766833305359, + "rewards/reward_fn/mean": 3.9643306732177734, + "rewards/reward_fn/std": 0.2017766386270523, + "step": 3301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 130.53125, + "completions/mean_terminated_length": 130.53125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.3813373368749278, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.030963296929257922, + "learning_rate": 1.3979999999999998e-06, + "loss": 0.0012, + "num_tokens": 76844762.0, + "reward": 3.973572254180908, + "reward_std": 0.14949871599674225, + "rewards/reward_fn/mean": 3.973572254180908, + "rewards/reward_fn/std": 0.14949870109558105, + "step": 3302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 90.40625, + "completions/mean_terminated_length": 90.40625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.38145282365169186, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0625, + "kl": 0.006442773785238387, + "learning_rate": 1.3959999999999998e-06, + "loss": 0.0003, + "num_tokens": 76864743.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 181.8125, + "completions/mean_terminated_length": 181.8125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.38156831042845596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04248046875, + "kl": 0.0064851879214984365, + "learning_rate": 1.394e-06, + "loss": 0.0003, + "num_tokens": 76883105.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 226.5625, + "completions/mean_terminated_length": 226.5625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.38168379720522, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.025046496681170538, + "learning_rate": 1.3919999999999998e-06, + "loss": 0.001, + "num_tokens": 76899411.0, + "reward": 3.036637306213379, + "reward_std": 0.071506567299366, + "rewards/reward_fn/mean": 3.036637306213379, + "rewards/reward_fn/std": 0.0715065598487854, + "step": 3305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 135.09375, + "completions/mean_terminated_length": 135.09375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.38179928398198404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.008355156678589992, + "learning_rate": 1.3899999999999998e-06, + "loss": 0.0003, + "num_tokens": 76914646.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 77.0, + "completions/max_terminated_length": 77.0, + "completions/mean_length": 56.5625, + "completions/mean_terminated_length": 56.5625, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.38191477075874813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1748046875, + "kl": 0.010621401346725179, + "learning_rate": 1.3879999999999999e-06, + "loss": 0.0004, + "num_tokens": 76938408.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 215.9375, + "completions/mean_terminated_length": 215.9375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.3820302575355122, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.027048256481066346, + "learning_rate": 1.3859999999999997e-06, + "loss": 0.0011, + "num_tokens": 76963206.0, + "reward": 3.9763097763061523, + "reward_std": 0.13401266932487488, + "rewards/reward_fn/mean": 3.9763097763061523, + "rewards/reward_fn/std": 0.13401265442371368, + "step": 3308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 102.28125, + "completions/mean_terminated_length": 102.28125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.38214574431227627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.00937264764070278, + "learning_rate": 1.3839999999999998e-06, + "loss": 0.0004, + "num_tokens": 76978319.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 214.78125, + "completions/mean_terminated_length": 214.78125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.3822612310890403, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.010920769898802973, + "learning_rate": 1.3819999999999999e-06, + "loss": 0.0004, + "num_tokens": 77008360.0, + "reward": 3.9759511947631836, + "reward_std": 0.1360398530960083, + "rewards/reward_fn/mean": 3.9759511947631836, + "rewards/reward_fn/std": 0.1360398232936859, + "step": 3310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 116.34375, + "completions/mean_terminated_length": 116.34375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.38237671786580435, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.01055975097551709, + "learning_rate": 1.38e-06, + "loss": 0.0004, + "num_tokens": 77023827.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 318.25, + "completions/mean_terminated_length": 318.25, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.38249220464256845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.01006750353553798, + "learning_rate": 1.3779999999999998e-06, + "loss": 0.0004, + "num_tokens": 77048859.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 77.03125, + "completions/mean_terminated_length": 77.03125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.3826076914193325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.006062226799258497, + "learning_rate": 1.3759999999999998e-06, + "loss": 0.0002, + "num_tokens": 77075516.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 97.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 71.34375, + "completions/mean_terminated_length": 71.34375, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.3827231781960965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.0053206455431791255, + "learning_rate": 1.374e-06, + "loss": 0.0002, + "num_tokens": 77100391.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 163.90625, + "completions/mean_terminated_length": 163.90625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.3828386649728606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.0064053278620122, + "learning_rate": 1.372e-06, + "loss": 0.0003, + "num_tokens": 77127108.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 188.6875, + "completions/mean_terminated_length": 188.6875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.38295415174962466, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0478515625, + "kl": 0.007275316464074422, + "learning_rate": 1.37e-06, + "loss": 0.0003, + "num_tokens": 77143322.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 98.0, + "completions/mean_terminated_length": 98.0, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.38306963852638876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.015865709210629575, + "learning_rate": 1.368e-06, + "loss": 0.0006, + "num_tokens": 77160634.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 129.65625, + "completions/mean_terminated_length": 129.65625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.3831851253031528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.008823346928693354, + "learning_rate": 1.3660000000000001e-06, + "loss": 0.0004, + "num_tokens": 77186927.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 223.53125, + "completions/mean_terminated_length": 223.53125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.38330061207991684, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.012757899981806986, + "learning_rate": 1.364e-06, + "loss": 0.0005, + "num_tokens": 77215424.0, + "reward": 3.4261794090270996, + "reward_std": 0.3397517502307892, + "rewards/reward_fn/mean": 3.4261794090270996, + "rewards/reward_fn/std": 0.3397516906261444, + "step": 3319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 158.8125, + "completions/mean_terminated_length": 158.8125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.38341609885668093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.012810848071239889, + "learning_rate": 1.362e-06, + "loss": 0.0005, + "num_tokens": 77235514.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 210.625, + "completions/mean_terminated_length": 210.625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.38353158563344497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.008466014784062281, + "learning_rate": 1.3600000000000001e-06, + "loss": 0.0003, + "num_tokens": 77268526.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 81.0625, + "completions/mean_terminated_length": 81.0625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.383647072410209, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0289306640625, + "kl": 0.0029653454757863074, + "learning_rate": 1.358e-06, + "loss": 0.0001, + "num_tokens": 77283248.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 102.71875, + "completions/mean_terminated_length": 102.71875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.3837625591869731, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0458984375, + "kl": 0.0035257697409178945, + "learning_rate": 1.356e-06, + "loss": 0.0001, + "num_tokens": 77306983.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 149.28125, + "completions/mean_terminated_length": 149.28125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.38387804596373715, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.984375, + "kl": 0.00887096887890948, + "learning_rate": 1.354e-06, + "loss": 0.0004, + "num_tokens": 77330864.0, + "reward": 3.9617090225219727, + "reward_std": 0.15116530656814575, + "rewards/reward_fn/mean": 3.9617090225219727, + "rewards/reward_fn/std": 0.15116530656814575, + "step": 3324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 171.15625, + "completions/mean_terminated_length": 171.15625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.3839935327405012, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045166015625, + "kl": 0.008487971761496738, + "learning_rate": 1.352e-06, + "loss": 0.0003, + "num_tokens": 77346997.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 432.71875, + "completions/mean_terminated_length": 432.71875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.3841090195172653, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03662109375, + "kl": 0.008624978225270752, + "learning_rate": 1.35e-06, + "loss": 0.0003, + "num_tokens": 77375724.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 309.375, + "completions/mean_terminated_length": 309.375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.3842245062940293, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.008375002529646736, + "learning_rate": 1.348e-06, + "loss": 0.0003, + "num_tokens": 77401240.0, + "reward": 3.9263758659362793, + "reward_std": 0.41648170351982117, + "rewards/reward_fn/mean": 3.9263758659362793, + "rewards/reward_fn/std": 0.41648170351982117, + "step": 3327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 149.40625, + "completions/mean_terminated_length": 149.40625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.3843399930707934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.008557203414966352, + "learning_rate": 1.3460000000000001e-06, + "loss": 0.0003, + "num_tokens": 77429317.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 110.34375, + "completions/mean_terminated_length": 110.34375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.38445547984755746, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "kl": 0.015061118945595808, + "learning_rate": 1.344e-06, + "loss": 0.0006, + "num_tokens": 77443472.0, + "reward": 3.9745731353759766, + "reward_std": 0.14383532106876373, + "rewards/reward_fn/mean": 3.9745731353759766, + "rewards/reward_fn/std": 0.14383530616760254, + "step": 3329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 115.28125, + "completions/mean_terminated_length": 115.28125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.3845709666243215, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.010740112826169934, + "learning_rate": 1.342e-06, + "loss": 0.0004, + "num_tokens": 77457433.0, + "reward": 3.8204307556152344, + "reward_std": 0.38052427768707275, + "rewards/reward_fn/mean": 3.8204307556152344, + "rewards/reward_fn/std": 0.38052430748939514, + "step": 3330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 112.5625, + "completions/mean_terminated_length": 112.5625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.3846864534010856, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.953125, + "kl": 0.0176955445058411, + "learning_rate": 1.34e-06, + "loss": 0.0007, + "num_tokens": 77491243.0, + "reward": 3.406209945678711, + "reward_std": 0.07040264457464218, + "rewards/reward_fn/mean": 3.406209945678711, + "rewards/reward_fn/std": 0.0704026073217392, + "step": 3331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 189.9375, + "completions/mean_terminated_length": 189.9375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.38480194017784963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.013536854181438684, + "learning_rate": 1.338e-06, + "loss": 0.0005, + "num_tokens": 77510121.0, + "reward": 3.807147979736328, + "reward_std": 0.40873798727989197, + "rewards/reward_fn/mean": 3.807147979736328, + "rewards/reward_fn/std": 0.40873798727989197, + "step": 3332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 196.84375, + "completions/mean_terminated_length": 196.84375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.3849174269546137, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.010802442324347794, + "learning_rate": 1.336e-06, + "loss": 0.0004, + "num_tokens": 77533124.0, + "reward": 3.9006805419921875, + "reward_std": 0.27110007405281067, + "rewards/reward_fn/mean": 3.9006805419921875, + "rewards/reward_fn/std": 0.27110010385513306, + "step": 3333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 197.4375, + "completions/mean_terminated_length": 197.4375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.38503291373137777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0419921875, + "kl": 0.006557671724294778, + "learning_rate": 1.334e-06, + "loss": 0.0003, + "num_tokens": 77552050.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 161.46875, + "completions/mean_terminated_length": 161.46875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.3851484005081418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.0073753345423028804, + "learning_rate": 1.332e-06, + "loss": 0.0003, + "num_tokens": 77577345.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 165.625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.3852638872849059, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.008242218136729207, + "learning_rate": 1.33e-06, + "loss": 0.0003, + "num_tokens": 77595445.0, + "reward": 3.9787302017211914, + "reward_std": 0.12032012641429901, + "rewards/reward_fn/mean": 3.9787302017211914, + "rewards/reward_fn/std": 0.12032010406255722, + "step": 3336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 137.40625, + "completions/mean_terminated_length": 137.40625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.38537937406166994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060791015625, + "kl": 0.008620184307801537, + "learning_rate": 1.328e-06, + "loss": 0.0003, + "num_tokens": 77613282.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 179.125, + "completions/mean_terminated_length": 179.125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.385494860838434, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.007121676026144996, + "learning_rate": 1.326e-06, + "loss": 0.0003, + "num_tokens": 77631558.0, + "reward": 3.4166347980499268, + "reward_std": 0.05475785955786705, + "rewards/reward_fn/mean": 3.4166347980499268, + "rewards/reward_fn/std": 0.054757874459028244, + "step": 3338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 198.6875, + "completions/mean_terminated_length": 198.6875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.3856103476151981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050048828125, + "kl": 0.007084387594659347, + "learning_rate": 1.324e-06, + "loss": 0.0003, + "num_tokens": 77656284.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 170.96875, + "completions/mean_terminated_length": 170.96875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.3857258343919621, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "kl": 0.008240650706284214, + "learning_rate": 1.322e-06, + "loss": 0.0003, + "num_tokens": 77679739.0, + "reward": 3.931414842605591, + "reward_std": 0.38797616958618164, + "rewards/reward_fn/mean": 3.931414842605591, + "rewards/reward_fn/std": 0.38797613978385925, + "step": 3340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 62.03125, + "completions/mean_terminated_length": 62.03125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.38584132116872616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.369140625, + "kl": 0.025438629279960878, + "learning_rate": 1.32e-06, + "loss": 0.001, + "num_tokens": 77702620.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 54.28125, + "completions/mean_terminated_length": 54.28125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.38595680794549025, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.09375, + "kl": 0.010122501604200806, + "learning_rate": 1.318e-06, + "loss": 0.0004, + "num_tokens": 77728709.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 80.09375, + "completions/mean_terminated_length": 80.09375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.3860722947222543, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "kl": 0.007607296565765864, + "learning_rate": 1.316e-06, + "loss": 0.0003, + "num_tokens": 77749768.0, + "reward": 3.9741716384887695, + "reward_std": 0.1461067795753479, + "rewards/reward_fn/mean": 3.9741716384887695, + "rewards/reward_fn/std": 0.1461068093776703, + "step": 3343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 169.09375, + "completions/mean_terminated_length": 169.09375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.3861877814990184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.009389625505718868, + "learning_rate": 1.314e-06, + "loss": 0.0004, + "num_tokens": 77776299.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 310.78125, + "completions/mean_terminated_length": 310.78125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.38630326827578243, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.010356419908930548, + "learning_rate": 1.312e-06, + "loss": 0.0004, + "num_tokens": 77809092.0, + "reward": 2.557553768157959, + "reward_std": 1.050899624824524, + "rewards/reward_fn/mean": 2.557553768157959, + "rewards/reward_fn/std": 1.050899624824524, + "step": 3345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 157.25, + "completions/mean_terminated_length": 157.25, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.38641875505254647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.00943157700021402, + "learning_rate": 1.31e-06, + "loss": 0.0004, + "num_tokens": 77832460.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 161.78125, + "completions/mean_terminated_length": 161.78125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.38653424182931057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056396484375, + "kl": 0.008251196733908728, + "learning_rate": 1.308e-06, + "loss": 0.0003, + "num_tokens": 77850789.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 223.625, + "completions/mean_terminated_length": 223.625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.3866497286060746, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.008569311874452978, + "learning_rate": 1.306e-06, + "loss": 0.0003, + "num_tokens": 77881081.0, + "reward": 3.9860806465148926, + "reward_std": 0.07873941212892532, + "rewards/reward_fn/mean": 3.9860806465148926, + "rewards/reward_fn/std": 0.07873937487602234, + "step": 3348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 93.5625, + "completions/mean_terminated_length": 93.5625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.38676521538283865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.00613209480070509, + "learning_rate": 1.304e-06, + "loss": 0.0002, + "num_tokens": 77906347.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 265.65625, + "completions/mean_terminated_length": 265.65625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.38688070215960274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.008301511443278287, + "learning_rate": 1.302e-06, + "loss": 0.0003, + "num_tokens": 77938176.0, + "reward": 3.9307289123535156, + "reward_std": 0.39185625314712524, + "rewards/reward_fn/mean": 3.9307289123535156, + "rewards/reward_fn/std": 0.39185619354248047, + "step": 3350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 119.0, + "completions/max_terminated_length": 119.0, + "completions/mean_length": 73.40625, + "completions/mean_terminated_length": 73.40625, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.3869961889363668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.005802022478746949, + "learning_rate": 1.3e-06, + "loss": 0.0002, + "num_tokens": 77959501.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 105.625, + "completions/mean_terminated_length": 105.625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.3871116757131308, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.007147781565436162, + "learning_rate": 1.298e-06, + "loss": 0.0003, + "num_tokens": 77981057.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 115.625, + "completions/mean_terminated_length": 115.625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.3872271624898949, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.011457349821284879, + "learning_rate": 1.296e-06, + "loss": 0.0005, + "num_tokens": 78013237.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 311.875, + "completions/mean_terminated_length": 311.875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.38734264926665896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04541015625, + "kl": 0.00911935738986358, + "learning_rate": 1.294e-06, + "loss": 0.0004, + "num_tokens": 78039217.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 159.375, + "completions/mean_terminated_length": 159.375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.38745813604342305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04541015625, + "kl": 0.007421811780659482, + "learning_rate": 1.292e-06, + "loss": 0.0003, + "num_tokens": 78055357.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.3875736228201871, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.011927559244213626, + "learning_rate": 1.29e-06, + "loss": 0.0005, + "num_tokens": 78082461.0, + "reward": 3.2687020301818848, + "reward_std": 0.5079047083854675, + "rewards/reward_fn/mean": 3.2687020301818848, + "rewards/reward_fn/std": 0.5079047083854675, + "step": 3356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 88.0, + "completions/max_terminated_length": 88.0, + "completions/mean_length": 59.1875, + "completions/mean_terminated_length": 59.1875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.38768910959695113, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265625, + "kl": 0.013985386118292809, + "learning_rate": 1.288e-06, + "loss": 0.0006, + "num_tokens": 78100739.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 423.78125, + "completions/mean_terminated_length": 423.78125, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.3878045963737152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04541015625, + "kl": 0.010971705836709589, + "learning_rate": 1.286e-06, + "loss": 0.0004, + "num_tokens": 78124572.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 88.0, + "completions/max_terminated_length": 88.0, + "completions/mean_length": 61.34375, + "completions/mean_terminated_length": 61.34375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.38792008315047927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.37890625, + "kl": 0.02259004990628455, + "learning_rate": 1.284e-06, + "loss": 0.0009, + "num_tokens": 78148199.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 136.59375, + "completions/mean_terminated_length": 136.59375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.3880355699272433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.008788238421402639, + "learning_rate": 1.282e-06, + "loss": 0.0004, + "num_tokens": 78164538.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 252.3125, + "completions/mean_terminated_length": 252.3125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.3881510567040074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.006199375260621309, + "learning_rate": 1.28e-06, + "loss": 0.0002, + "num_tokens": 78188068.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 178.0, + "completions/mean_terminated_length": 178.0, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.38826654348077144, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.007522273204813246, + "learning_rate": 1.2779999999999999e-06, + "loss": 0.0003, + "num_tokens": 78210788.0, + "reward": 3.894749641418457, + "reward_std": 0.43896427750587463, + "rewards/reward_fn/mean": 3.894749641418457, + "rewards/reward_fn/std": 0.43896427750587463, + "step": 3362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 157.96875, + "completions/mean_terminated_length": 157.96875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.38838203025753554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.013303244588314556, + "learning_rate": 1.276e-06, + "loss": 0.0005, + "num_tokens": 78240291.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 101.3125, + "completions/mean_terminated_length": 101.3125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.3884975170342996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.009087925056519452, + "learning_rate": 1.274e-06, + "loss": 0.0004, + "num_tokens": 78255181.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 407.21875, + "completions/mean_terminated_length": 407.21875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.3886130038110636, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.010193504611379467, + "learning_rate": 1.272e-06, + "loss": 0.0004, + "num_tokens": 78280468.0, + "reward": 3.9280030727386475, + "reward_std": 0.4072764217853546, + "rewards/reward_fn/mean": 3.9280030727386475, + "rewards/reward_fn/std": 0.407276451587677, + "step": 3365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 393.6875, + "completions/mean_terminated_length": 393.6875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.3887284905878277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.011744963689125143, + "learning_rate": 1.27e-06, + "loss": 0.0005, + "num_tokens": 78305450.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 174.75, + "completions/mean_terminated_length": 174.75, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.38884397736459175, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.013311140181031078, + "learning_rate": 1.268e-06, + "loss": 0.0005, + "num_tokens": 78331778.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 149.25, + "completions/mean_terminated_length": 149.25, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.3889594641413558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0380859375, + "kl": 0.005641784460749477, + "learning_rate": 1.266e-06, + "loss": 0.0002, + "num_tokens": 78358890.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 215.90625, + "completions/mean_terminated_length": 215.90625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.3890749509181199, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.010780859185615554, + "learning_rate": 1.2639999999999999e-06, + "loss": 0.0004, + "num_tokens": 78389959.0, + "reward": 3.7937536239624023, + "reward_std": 0.3970223367214203, + "rewards/reward_fn/mean": 3.7937536239624023, + "rewards/reward_fn/std": 0.3970223367214203, + "step": 3369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 168.71875, + "completions/mean_terminated_length": 168.71875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.38919043769488393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.016475447890115902, + "learning_rate": 1.262e-06, + "loss": 0.0007, + "num_tokens": 78414878.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 108.46875, + "completions/mean_terminated_length": 108.46875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.389305924471648, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.00930326440720819, + "learning_rate": 1.26e-06, + "loss": 0.0004, + "num_tokens": 78445165.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 96.8125, + "completions/mean_terminated_length": 96.8125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.38942141124841206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.006196314448970952, + "learning_rate": 1.2579999999999999e-06, + "loss": 0.0002, + "num_tokens": 78465959.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 104.71875, + "completions/mean_terminated_length": 104.71875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.3895368980251761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.011523336048412602, + "learning_rate": 1.256e-06, + "loss": 0.0005, + "num_tokens": 78491582.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 164.9375, + "completions/mean_terminated_length": 164.9375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.3896523848019402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041259765625, + "kl": 0.006433160331653198, + "learning_rate": 1.254e-06, + "loss": 0.0003, + "num_tokens": 78506524.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 85.3125, + "completions/mean_terminated_length": 85.3125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.38976787157870424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1611328125, + "kl": 0.01487214773078449, + "learning_rate": 1.252e-06, + "loss": 0.0006, + "num_tokens": 78529574.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 169.5625, + "completions/mean_terminated_length": 169.5625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.3898833583554683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037841796875, + "kl": 0.005057143171143252, + "learning_rate": 1.2499999999999999e-06, + "loss": 0.0002, + "num_tokens": 78554680.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 265.71875, + "completions/mean_terminated_length": 265.71875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.3899988451322324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.0075121142726857215, + "learning_rate": 1.248e-06, + "loss": 0.0003, + "num_tokens": 78579215.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 352.5, + "completions/mean_terminated_length": 352.5, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.3901143319089964, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.012559271563077345, + "learning_rate": 1.246e-06, + "loss": 0.0005, + "num_tokens": 78601247.0, + "reward": 3.7766294479370117, + "reward_std": 0.7056067585945129, + "rewards/reward_fn/mean": 3.7766294479370117, + "rewards/reward_fn/std": 0.7056068181991577, + "step": 3378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 81.375, + "completions/mean_terminated_length": 81.375, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.39022981868576045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.008250873957877047, + "learning_rate": 1.2439999999999999e-06, + "loss": 0.0003, + "num_tokens": 78626347.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 219.8125, + "completions/mean_terminated_length": 219.8125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.39034530546252455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0400390625, + "kl": 0.0072002023152890615, + "learning_rate": 1.242e-06, + "loss": 0.0003, + "num_tokens": 78654917.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 104.65625, + "completions/mean_terminated_length": 104.65625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.3904607922392886, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.01129025433328934, + "learning_rate": 1.24e-06, + "loss": 0.0005, + "num_tokens": 78676666.0, + "reward": 3.974565029144287, + "reward_std": 0.14388108253479004, + "rewards/reward_fn/mean": 3.974565029144287, + "rewards/reward_fn/std": 0.14388103783130646, + "step": 3381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 232.625, + "completions/mean_terminated_length": 232.625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.3905762790160527, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.010995593795087188, + "learning_rate": 1.2379999999999998e-06, + "loss": 0.0004, + "num_tokens": 78694446.0, + "reward": 3.933448553085327, + "reward_std": 0.37647169828414917, + "rewards/reward_fn/mean": 3.933448553085327, + "rewards/reward_fn/std": 0.3764716386795044, + "step": 3382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 185.34375, + "completions/mean_terminated_length": 185.34375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.3906917657928167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.008862176073307637, + "learning_rate": 1.236e-06, + "loss": 0.0004, + "num_tokens": 78715353.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 458.0625, + "completions/mean_terminated_length": 406.774169921875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.39080725256958077, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.796875, + "kl": 0.012120617509935983, + "learning_rate": 1.234e-06, + "loss": 0.0005, + "num_tokens": 78751867.0, + "reward": 3.8006186485290527, + "reward_std": 0.8110783100128174, + "rewards/reward_fn/mean": 3.8006186485290527, + "rewards/reward_fn/std": 0.8110783696174622, + "step": 3384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 83.1875, + "completions/mean_terminated_length": 83.1875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.39092273934634486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035400390625, + "kl": 0.002661524798895698, + "learning_rate": 1.232e-06, + "loss": 0.0001, + "num_tokens": 78775457.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 210.5625, + "completions/mean_terminated_length": 210.5625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.3910382261231089, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "kl": 0.010377262311521918, + "learning_rate": 1.2299999999999999e-06, + "loss": 0.0004, + "num_tokens": 78798099.0, + "reward": 3.894011974334717, + "reward_std": 0.4435270130634308, + "rewards/reward_fn/mean": 3.894011974334717, + "rewards/reward_fn/std": 0.443526953458786, + "step": 3386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 120.375, + "completions/mean_terminated_length": 120.375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.39115371289987294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.016967828807537444, + "learning_rate": 1.228e-06, + "loss": 0.0007, + "num_tokens": 78820991.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 89.71875, + "completions/mean_terminated_length": 89.71875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.39126919967663704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.006880851731693838, + "learning_rate": 1.226e-06, + "loss": 0.0003, + "num_tokens": 78842038.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 169.21875, + "completions/mean_terminated_length": 169.21875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.3913846864534011, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04248046875, + "kl": 0.007253772095282329, + "learning_rate": 1.2239999999999998e-06, + "loss": 0.0003, + "num_tokens": 78858781.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 105.4375, + "completions/mean_terminated_length": 105.4375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.39150017323016517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.005751350829086732, + "learning_rate": 1.222e-06, + "loss": 0.0002, + "num_tokens": 78879083.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 262.46875, + "completions/mean_terminated_length": 262.46875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.3916156600069292, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.00864266324788332, + "learning_rate": 1.22e-06, + "loss": 0.0003, + "num_tokens": 78905178.0, + "reward": 3.92830491065979, + "reward_std": 0.4055686295032501, + "rewards/reward_fn/mean": 3.92830491065979, + "rewards/reward_fn/std": 0.4055686295032501, + "step": 3391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 137.25, + "completions/mean_terminated_length": 137.25, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.39173114678369325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.01430725037062075, + "learning_rate": 1.2179999999999998e-06, + "loss": 0.0006, + "num_tokens": 78927746.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 117.96875, + "completions/mean_terminated_length": 117.96875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.39184663356045735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.008848129262332805, + "learning_rate": 1.2159999999999999e-06, + "loss": 0.0004, + "num_tokens": 78958817.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 243.53125, + "completions/mean_terminated_length": 243.53125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.3919621203372214, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.02902104090026114, + "learning_rate": 1.214e-06, + "loss": 0.0012, + "num_tokens": 78990162.0, + "reward": 3.6067709922790527, + "reward_std": 0.5235899090766907, + "rewards/reward_fn/mean": 3.6067709922790527, + "rewards/reward_fn/std": 0.5235898494720459, + "step": 3394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 85.3125, + "completions/mean_terminated_length": 85.3125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.3920776071139854, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.140625, + "kl": 0.005716103600207134, + "learning_rate": 1.212e-06, + "loss": 0.0002, + "num_tokens": 79003580.0, + "reward": 3.2446131706237793, + "reward_std": 0.05310458689928055, + "rewards/reward_fn/mean": 3.2446131706237793, + "rewards/reward_fn/std": 0.05310462787747383, + "step": 3395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 207.21875, + "completions/mean_terminated_length": 207.21875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.3921930938907495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0400390625, + "kl": 0.006442922123824246, + "learning_rate": 1.2099999999999998e-06, + "loss": 0.0003, + "num_tokens": 79022403.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 95.78125, + "completions/mean_terminated_length": 95.78125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.39230858066751356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.007721657235379098, + "learning_rate": 1.208e-06, + "loss": 0.0003, + "num_tokens": 79043004.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 205.8125, + "completions/mean_terminated_length": 205.8125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.39242406744427766, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.01016439514933154, + "learning_rate": 1.206e-06, + "loss": 0.0004, + "num_tokens": 79070710.0, + "reward": 3.286691665649414, + "reward_std": 0.51935213804245, + "rewards/reward_fn/mean": 3.286691665649414, + "rewards/reward_fn/std": 0.5193520784378052, + "step": 3398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 54.6875, + "completions/mean_terminated_length": 54.6875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.3925395542210417, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2890625, + "kl": 0.016185460721317213, + "learning_rate": 1.2039999999999998e-06, + "loss": 0.0006, + "num_tokens": 79090252.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 211.75, + "completions/mean_terminated_length": 211.75, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.39265504099780574, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.01159271816140972, + "learning_rate": 1.2019999999999999e-06, + "loss": 0.0005, + "num_tokens": 79109732.0, + "reward": 3.2701315879821777, + "reward_std": 0.47073933482170105, + "rewards/reward_fn/mean": 3.2701315879821777, + "rewards/reward_fn/std": 0.47073930501937866, + "step": 3400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 199.96875, + "completions/mean_terminated_length": 199.96875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.39277052777456983, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.010958836719510145, + "learning_rate": 1.2e-06, + "loss": 0.0004, + "num_tokens": 79128899.0, + "reward": 3.129216194152832, + "reward_std": 0.5854543447494507, + "rewards/reward_fn/mean": 3.129216194152832, + "rewards/reward_fn/std": 0.5854542851448059, + "step": 3401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 104.0, + "completions/max_terminated_length": 104.0, + "completions/mean_length": 63.84375, + "completions/mean_terminated_length": 63.84375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.3928860145513339, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "kl": 0.004464033412659774, + "learning_rate": 1.1979999999999998e-06, + "loss": 0.0002, + "num_tokens": 79152574.0, + "reward": 3.68819260597229, + "reward_std": 0.018529005348682404, + "rewards/reward_fn/mean": 3.68819260597229, + "rewards/reward_fn/std": 0.01852901093661785, + "step": 3402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 110.375, + "completions/mean_terminated_length": 110.375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.3930015013280979, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.010728218505391851, + "learning_rate": 1.1959999999999999e-06, + "loss": 0.0004, + "num_tokens": 79173802.0, + "reward": 3.3655471801757812, + "reward_std": 0.03787308931350708, + "rewards/reward_fn/mean": 3.3655471801757812, + "rewards/reward_fn/std": 0.03787313774228096, + "step": 3403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 144.25, + "completions/mean_terminated_length": 144.25, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.393116988104862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.010453785558638629, + "learning_rate": 1.194e-06, + "loss": 0.0004, + "num_tokens": 79200818.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 111.84375, + "completions/mean_terminated_length": 111.84375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.39323247488162605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054931640625, + "kl": 0.005008871707104845, + "learning_rate": 1.192e-06, + "loss": 0.0002, + "num_tokens": 79223597.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 104.0, + "completions/mean_terminated_length": 104.0, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.3933479616583901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.010106239533342887, + "learning_rate": 1.1899999999999998e-06, + "loss": 0.0004, + "num_tokens": 79243117.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 171.1875, + "completions/mean_terminated_length": 171.1875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.3934634484351542, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.011699369933921844, + "learning_rate": 1.1879999999999999e-06, + "loss": 0.0005, + "num_tokens": 79262131.0, + "reward": 3.0389389991760254, + "reward_std": 0.327124685049057, + "rewards/reward_fn/mean": 3.0389389991760254, + "rewards/reward_fn/std": 0.32712462544441223, + "step": 3407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 216.5, + "completions/mean_terminated_length": 216.5, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.3935789352119182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.005760640600783518, + "learning_rate": 1.186e-06, + "loss": 0.0002, + "num_tokens": 79287715.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 165.65625, + "completions/mean_terminated_length": 165.65625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.3936944219886823, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.007349457220698241, + "learning_rate": 1.1839999999999998e-06, + "loss": 0.0003, + "num_tokens": 79317016.0, + "reward": 3.2134456634521484, + "reward_std": 0.05085834115743637, + "rewards/reward_fn/mean": 3.2134456634521484, + "rewards/reward_fn/std": 0.050858307629823685, + "step": 3409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 217.21875, + "completions/mean_terminated_length": 217.21875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.39380990876544636, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.01337398674513679, + "learning_rate": 1.1819999999999999e-06, + "loss": 0.0005, + "num_tokens": 79340799.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 107.875, + "completions/mean_terminated_length": 107.875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.3939253955422104, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.007775934398523532, + "learning_rate": 1.18e-06, + "loss": 0.0003, + "num_tokens": 79354299.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 157.5625, + "completions/mean_terminated_length": 157.5625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.3940408823189745, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.008727975000510924, + "learning_rate": 1.178e-06, + "loss": 0.0003, + "num_tokens": 79378573.0, + "reward": 3.2329652309417725, + "reward_std": 0.4193273186683655, + "rewards/reward_fn/mean": 3.2329652309417725, + "rewards/reward_fn/std": 0.4193272292613983, + "step": 3412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 127.0, + "completions/mean_terminated_length": 127.0, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.39415636909573853, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.84375, + "kl": 0.010842679112101905, + "learning_rate": 1.1759999999999998e-06, + "loss": 0.0004, + "num_tokens": 79394861.0, + "reward": 3.8593673706054688, + "reward_std": 0.5535110235214233, + "rewards/reward_fn/mean": 3.8593673706054688, + "rewards/reward_fn/std": 0.5535110235214233, + "step": 3413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 218.125, + "completions/mean_terminated_length": 218.125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.3942718558725026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.010528347993385978, + "learning_rate": 1.174e-06, + "loss": 0.0004, + "num_tokens": 79424561.0, + "reward": 3.9757285118103027, + "reward_std": 0.13730043172836304, + "rewards/reward_fn/mean": 3.9757285118103027, + "rewards/reward_fn/std": 0.13730047643184662, + "step": 3414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 110.15625, + "completions/mean_terminated_length": 110.15625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.39438734264926667, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.009293150222219992, + "learning_rate": 1.172e-06, + "loss": 0.0004, + "num_tokens": 79446230.0, + "reward": 3.8899130821228027, + "reward_std": 0.2601983845233917, + "rewards/reward_fn/mean": 3.8899130821228027, + "rewards/reward_fn/std": 0.2601983845233917, + "step": 3415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 62.59375, + "completions/mean_terminated_length": 62.59375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.3945028294260307, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.78125, + "kl": 0.010882222690270282, + "learning_rate": 1.1699999999999998e-06, + "loss": 0.0004, + "num_tokens": 79474537.0, + "reward": 3.729227304458618, + "reward_std": 0.02100970409810543, + "rewards/reward_fn/mean": 3.729227304458618, + "rewards/reward_fn/std": 0.021009642630815506, + "step": 3416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 311.6875, + "completions/mean_terminated_length": 311.6875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.3946183162027948, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.008133621638990007, + "learning_rate": 1.1679999999999999e-06, + "loss": 0.0003, + "num_tokens": 79500607.0, + "reward": 3.928797960281372, + "reward_std": 0.4027792513370514, + "rewards/reward_fn/mean": 3.928797960281372, + "rewards/reward_fn/std": 0.402779221534729, + "step": 3417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 90.1875, + "completions/mean_terminated_length": 90.1875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.39473380297955885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.00862209034676198, + "learning_rate": 1.166e-06, + "loss": 0.0003, + "num_tokens": 79520421.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 247.1875, + "completions/mean_terminated_length": 247.1875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.3948492897563229, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.009237469952495303, + "learning_rate": 1.1639999999999998e-06, + "loss": 0.0004, + "num_tokens": 79544363.0, + "reward": 2.8115196228027344, + "reward_std": 0.2257312685251236, + "rewards/reward_fn/mean": 2.8115196228027344, + "rewards/reward_fn/std": 0.22573129832744598, + "step": 3419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 342.125, + "completions/mean_terminated_length": 342.125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.394964776533087, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.01283966151822824, + "learning_rate": 1.1619999999999998e-06, + "loss": 0.0005, + "num_tokens": 79569391.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 101.0625, + "completions/mean_terminated_length": 101.0625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.395080263309851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12158203125, + "kl": 0.010280993195920018, + "learning_rate": 1.16e-06, + "loss": 0.0004, + "num_tokens": 79589329.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 167.25, + "completions/mean_terminated_length": 167.25, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.39519575008661506, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.009999039575632196, + "learning_rate": 1.158e-06, + "loss": 0.0004, + "num_tokens": 79616057.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 84.71875, + "completions/mean_terminated_length": 84.71875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.39531123686337916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.007001488698733738, + "learning_rate": 1.1559999999999998e-06, + "loss": 0.0003, + "num_tokens": 79644560.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 161.0625, + "completions/mean_terminated_length": 161.0625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.3954267236401432, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.014203473707311787, + "learning_rate": 1.1539999999999999e-06, + "loss": 0.0006, + "num_tokens": 79672722.0, + "reward": 3.3622560501098633, + "reward_std": 0.124803327023983, + "rewards/reward_fn/mean": 3.3622560501098633, + "rewards/reward_fn/std": 0.12480339407920837, + "step": 3424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 201.1875, + "completions/mean_terminated_length": 201.1875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.3955422104169073, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.013542123648221605, + "learning_rate": 1.152e-06, + "loss": 0.0005, + "num_tokens": 79697624.0, + "reward": 3.971544027328491, + "reward_std": 0.16097134351730347, + "rewards/reward_fn/mean": 3.971544027328491, + "rewards/reward_fn/std": 0.16097137331962585, + "step": 3425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 255.15625, + "completions/mean_terminated_length": 255.15625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.39565769719367133, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.011149133613798767, + "learning_rate": 1.1499999999999998e-06, + "loss": 0.0004, + "num_tokens": 79717917.0, + "reward": 3.928313732147217, + "reward_std": 0.405518501996994, + "rewards/reward_fn/mean": 3.928313732147217, + "rewards/reward_fn/std": 0.405518501996994, + "step": 3426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 83.1875, + "completions/mean_terminated_length": 83.1875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.39577318397043537, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.828125, + "kl": 0.007554177162091946, + "learning_rate": 1.1479999999999999e-06, + "loss": 0.0003, + "num_tokens": 79739747.0, + "reward": 3.9326038360595703, + "reward_std": 0.38124993443489075, + "rewards/reward_fn/mean": 3.9326038360595703, + "rewards/reward_fn/std": 0.38124993443489075, + "step": 3427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 94.90625, + "completions/mean_terminated_length": 94.90625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.39588867074719947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.009267215285944985, + "learning_rate": 1.146e-06, + "loss": 0.0004, + "num_tokens": 79754944.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 87.21875, + "completions/mean_terminated_length": 87.21875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.3960041575239635, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.390625, + "kl": 0.011820434403489344, + "learning_rate": 1.1439999999999998e-06, + "loss": 0.0005, + "num_tokens": 79774695.0, + "reward": 3.4725492000579834, + "reward_std": 0.6082494854927063, + "rewards/reward_fn/mean": 3.4725492000579834, + "rewards/reward_fn/std": 0.6082494854927063, + "step": 3429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 136.09375, + "completions/mean_terminated_length": 136.09375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.39611964430072755, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.003576393172807002, + "learning_rate": 1.1419999999999998e-06, + "loss": 0.0001, + "num_tokens": 79797866.0, + "reward": 3.979501247406006, + "reward_std": 0.11595765501260757, + "rewards/reward_fn/mean": 3.979501247406006, + "rewards/reward_fn/std": 0.11595765501260757, + "step": 3430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 362.8125, + "completions/mean_terminated_length": 362.8125, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.39623513107749164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037353515625, + "kl": 0.008044038455409463, + "learning_rate": 1.1399999999999999e-06, + "loss": 0.0003, + "num_tokens": 79824548.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 87.53125, + "completions/mean_terminated_length": 87.53125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.3963506178542557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1484375, + "kl": 0.007856206175347324, + "learning_rate": 1.138e-06, + "loss": 0.0003, + "num_tokens": 79847573.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 174.125, + "completions/mean_terminated_length": 174.125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.3964661046310197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.006466274804552086, + "learning_rate": 1.1359999999999998e-06, + "loss": 0.0003, + "num_tokens": 79876921.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 84.53125, + "completions/mean_terminated_length": 84.53125, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.3965815914077838, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.0059123493992956355, + "learning_rate": 1.1339999999999999e-06, + "loss": 0.0002, + "num_tokens": 79897290.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 66.0, + "completions/mean_terminated_length": 66.0, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.39669707818454786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.008239696948294295, + "learning_rate": 1.132e-06, + "loss": 0.0003, + "num_tokens": 79920586.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 83.125, + "completions/mean_terminated_length": 83.125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.39681256496131195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11962890625, + "kl": 0.01342466805363074, + "learning_rate": 1.1299999999999998e-06, + "loss": 0.0005, + "num_tokens": 79944526.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 168.5, + "completions/mean_terminated_length": 168.5, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.396928051738076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.015449210521182977, + "learning_rate": 1.1279999999999998e-06, + "loss": 0.0006, + "num_tokens": 79961534.0, + "reward": 3.3143508434295654, + "reward_std": 0.07713533192873001, + "rewards/reward_fn/mean": 3.3143508434295654, + "rewards/reward_fn/std": 0.07713530957698822, + "step": 3437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 117.09375, + "completions/mean_terminated_length": 117.09375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.39704353851484003, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.012018805071420502, + "learning_rate": 1.1259999999999999e-06, + "loss": 0.0005, + "num_tokens": 79983681.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 188.84375, + "completions/mean_terminated_length": 188.84375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.39715902529160413, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.007834798947442323, + "learning_rate": 1.1240000000000002e-06, + "loss": 0.0003, + "num_tokens": 80007452.0, + "reward": 3.9683585166931152, + "reward_std": 0.178990438580513, + "rewards/reward_fn/mean": 3.9683585166931152, + "rewards/reward_fn/std": 0.1789904087781906, + "step": 3439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 211.0625, + "completions/mean_terminated_length": 211.0625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.39727451206836817, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.011234495512326248, + "learning_rate": 1.122e-06, + "loss": 0.0004, + "num_tokens": 80040190.0, + "reward": 2.7949984073638916, + "reward_std": 0.41830962896347046, + "rewards/reward_fn/mean": 2.7949984073638916, + "rewards/reward_fn/std": 0.41830965876579285, + "step": 3440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 318.5625, + "completions/mean_terminated_length": 318.5625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.3973899988451322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.008048995012359228, + "learning_rate": 1.12e-06, + "loss": 0.0003, + "num_tokens": 80065008.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 118.6875, + "completions/mean_terminated_length": 118.6875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3975054856218963, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "kl": 0.01124204316874966, + "learning_rate": 1.1180000000000001e-06, + "loss": 0.0004, + "num_tokens": 80093062.0, + "reward": 3.8654370307922363, + "reward_std": 0.5295272469520569, + "rewards/reward_fn/mean": 3.8654370307922363, + "rewards/reward_fn/std": 0.5295271873474121, + "step": 3442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 347.46875, + "completions/mean_terminated_length": 347.46875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.39762097239866034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.007736592007859144, + "learning_rate": 1.116e-06, + "loss": 0.0003, + "num_tokens": 80119509.0, + "reward": 3.8546981811523438, + "reward_std": 0.5717868804931641, + "rewards/reward_fn/mean": 3.8546981811523438, + "rewards/reward_fn/std": 0.5717868208885193, + "step": 3443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 344.03125, + "completions/mean_terminated_length": 344.03125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.39773645917542444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.008644154600915499, + "learning_rate": 1.114e-06, + "loss": 0.0003, + "num_tokens": 80146678.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 117.65625, + "completions/mean_terminated_length": 117.65625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.3978519459521885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.00890123681165278, + "learning_rate": 1.1120000000000001e-06, + "loss": 0.0004, + "num_tokens": 80173675.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 93.90625, + "completions/mean_terminated_length": 93.90625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.3979674327289525, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.75, + "kl": 0.008107106281386223, + "learning_rate": 1.11e-06, + "loss": 0.0003, + "num_tokens": 80193480.0, + "reward": 3.9331369400024414, + "reward_std": 0.3782346546649933, + "rewards/reward_fn/mean": 3.9331369400024414, + "rewards/reward_fn/std": 0.3782346546649933, + "step": 3446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 283.25, + "completions/mean_terminated_length": 283.25, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.3980829195057166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.012603557770489715, + "learning_rate": 1.108e-06, + "loss": 0.0005, + "num_tokens": 80210800.0, + "reward": 3.7182390689849854, + "reward_std": 0.45877668261528015, + "rewards/reward_fn/mean": 3.7182390689849854, + "rewards/reward_fn/std": 0.45877668261528015, + "step": 3447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 133.5, + "completions/mean_terminated_length": 133.5, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.39819840628248065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.010075225509353913, + "learning_rate": 1.106e-06, + "loss": 0.0004, + "num_tokens": 80234304.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 209.5625, + "completions/mean_terminated_length": 209.5625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.3983138930592447, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.008220012634410523, + "learning_rate": 1.1040000000000001e-06, + "loss": 0.0003, + "num_tokens": 80259346.0, + "reward": 3.6776986122131348, + "reward_std": 0.7813527584075928, + "rewards/reward_fn/mean": 3.6776986122131348, + "rewards/reward_fn/std": 0.7813528180122375, + "step": 3449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 92.96875, + "completions/mean_terminated_length": 92.96875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.3984293798360088, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.01663708910200512, + "learning_rate": 1.102e-06, + "loss": 0.0007, + "num_tokens": 80277137.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 187.75, + "completions/mean_terminated_length": 187.75, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.39854486661277283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03955078125, + "kl": 0.007285414380021393, + "learning_rate": 1.1e-06, + "loss": 0.0003, + "num_tokens": 80297321.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 199.75, + "completions/mean_terminated_length": 199.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.3986603533895369, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.007876015421061311, + "learning_rate": 1.0980000000000001e-06, + "loss": 0.0003, + "num_tokens": 80319073.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 101.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 80.625, + "completions/mean_terminated_length": 80.625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.39877584016630097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.003305647134766332, + "learning_rate": 1.096e-06, + "loss": 0.0001, + "num_tokens": 80342037.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 86.15625, + "completions/mean_terminated_length": 86.15625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.398891326943065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.012017641551210545, + "learning_rate": 1.094e-06, + "loss": 0.0005, + "num_tokens": 80355674.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 395.28125, + "completions/mean_terminated_length": 395.28125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.3990068137198291, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.011560611805180088, + "learning_rate": 1.092e-06, + "loss": 0.0005, + "num_tokens": 80376451.0, + "reward": 3.5297389030456543, + "reward_std": 0.7971243262290955, + "rewards/reward_fn/mean": 3.5297389030456543, + "rewards/reward_fn/std": 0.7971243262290955, + "step": 3455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 285.40625, + "completions/mean_terminated_length": 285.40625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.39912230049659314, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.015951176537782885, + "learning_rate": 1.09e-06, + "loss": 0.0006, + "num_tokens": 80401936.0, + "reward": 3.3709094524383545, + "reward_std": 0.5815379023551941, + "rewards/reward_fn/mean": 3.3709094524383545, + "rewards/reward_fn/std": 0.5815378427505493, + "step": 3456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 236.15625, + "completions/mean_terminated_length": 236.15625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.3992377872733572, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.015793327838764526, + "learning_rate": 1.088e-06, + "loss": 0.0006, + "num_tokens": 80425845.0, + "reward": 2.89840030670166, + "reward_std": 0.20947635173797607, + "rewards/reward_fn/mean": 2.89840030670166, + "rewards/reward_fn/std": 0.20947633683681488, + "step": 3457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 108.40625, + "completions/mean_terminated_length": 108.40625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.3993532740501213, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043212890625, + "kl": 0.002983710601256462, + "learning_rate": 1.086e-06, + "loss": 0.0001, + "num_tokens": 80441506.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 288.28125, + "completions/mean_terminated_length": 288.28125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.3994687608268853, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.0155018801742699, + "learning_rate": 1.0840000000000001e-06, + "loss": 0.0006, + "num_tokens": 80477995.0, + "reward": 2.821277618408203, + "reward_std": 0.7391989827156067, + "rewards/reward_fn/mean": 2.821277618408203, + "rewards/reward_fn/std": 0.7391989827156067, + "step": 3459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 138.875, + "completions/mean_terminated_length": 138.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.39958424760364936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2099609375, + "kl": 0.027539333124877885, + "learning_rate": 1.082e-06, + "loss": 0.0011, + "num_tokens": 80503943.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 217.28125, + "completions/mean_terminated_length": 217.28125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.39969973438041345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.01399982170551084, + "learning_rate": 1.08e-06, + "loss": 0.0006, + "num_tokens": 80529520.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 72.46875, + "completions/mean_terminated_length": 72.46875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.3998152211571775, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045166015625, + "kl": 0.003251578560593771, + "learning_rate": 1.078e-06, + "loss": 0.0001, + "num_tokens": 80543231.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 116.90625, + "completions/mean_terminated_length": 116.90625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3999307079339416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.012513674977526534, + "learning_rate": 1.076e-06, + "loss": 0.0005, + "num_tokens": 80568988.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 159.8125, + "completions/mean_terminated_length": 159.8125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.4000461947107056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.010046535622677766, + "learning_rate": 1.074e-06, + "loss": 0.0004, + "num_tokens": 80587478.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 112.09375, + "completions/mean_terminated_length": 112.09375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.40016168148746967, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.34375, + "kl": 0.011406238452764228, + "learning_rate": 1.072e-06, + "loss": 0.0005, + "num_tokens": 80610425.0, + "reward": 3.736330032348633, + "reward_std": 0.5064060091972351, + "rewards/reward_fn/mean": 3.736330032348633, + "rewards/reward_fn/std": 0.5064059495925903, + "step": 3465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 355.6875, + "completions/mean_terminated_length": 355.6875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.40027716826423376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.012057443338562734, + "learning_rate": 1.07e-06, + "loss": 0.0005, + "num_tokens": 80640687.0, + "reward": 3.853006362915039, + "reward_std": 0.3990156352519989, + "rewards/reward_fn/mean": 3.853006362915039, + "rewards/reward_fn/std": 0.3990156352519989, + "step": 3466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 119.625, + "completions/mean_terminated_length": 119.625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.4003926550409978, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.014318689121864736, + "learning_rate": 1.068e-06, + "loss": 0.0006, + "num_tokens": 80664323.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 97.875, + "completions/mean_terminated_length": 97.875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.40050814181776184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.0059935845874861116, + "learning_rate": 1.066e-06, + "loss": 0.0002, + "num_tokens": 80686783.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 54.0, + "completions/mean_terminated_length": 54.0, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.40062362859452594, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.59375, + "kl": 0.019370375335711287, + "learning_rate": 1.064e-06, + "loss": 0.0008, + "num_tokens": 80699679.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 171.46875, + "completions/mean_terminated_length": 171.46875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.40073911537129, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.013423645665170625, + "learning_rate": 1.062e-06, + "loss": 0.0005, + "num_tokens": 80717166.0, + "reward": 3.9642534255981445, + "reward_std": 0.20221278071403503, + "rewards/reward_fn/mean": 3.9642534255981445, + "rewards/reward_fn/std": 0.20221273601055145, + "step": 3470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 252.34375, + "completions/mean_terminated_length": 194.41934204101562, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.4008546021480541, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96875, + "kl": 0.008988373738247901, + "learning_rate": 1.06e-06, + "loss": 0.0004, + "num_tokens": 80737561.0, + "reward": 3.840367555618286, + "reward_std": 0.7276288270950317, + "rewards/reward_fn/mean": 3.840367555618286, + "rewards/reward_fn/std": 0.7276288270950317, + "step": 3471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 144.0, + "completions/mean_terminated_length": 144.0, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.4009700889248181, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.008204443387512583, + "learning_rate": 1.058e-06, + "loss": 0.0003, + "num_tokens": 80763449.0, + "reward": 3.928945541381836, + "reward_std": 0.16797304153442383, + "rewards/reward_fn/mean": 3.928945541381836, + "rewards/reward_fn/std": 0.16797307133674622, + "step": 3472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 228.78125, + "completions/mean_terminated_length": 228.78125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.40108557570158215, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.018511667891289108, + "learning_rate": 1.056e-06, + "loss": 0.0007, + "num_tokens": 80793970.0, + "reward": 3.962057590484619, + "reward_std": 0.14930294454097748, + "rewards/reward_fn/mean": 3.962057590484619, + "rewards/reward_fn/std": 0.14930292963981628, + "step": 3473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 220.0, + "completions/mean_terminated_length": 220.0, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.40120106247834625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0439453125, + "kl": 0.008483878249535337, + "learning_rate": 1.054e-06, + "loss": 0.0003, + "num_tokens": 80813298.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 140.9375, + "completions/mean_terminated_length": 140.9375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.4013165492551103, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.84375, + "kl": 0.016028323356295004, + "learning_rate": 1.052e-06, + "loss": 0.0006, + "num_tokens": 80829072.0, + "reward": 3.2491962909698486, + "reward_std": 0.13250786066055298, + "rewards/reward_fn/mean": 3.2491962909698486, + "rewards/reward_fn/std": 0.13250786066055298, + "step": 3475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 102.3125, + "completions/mean_terminated_length": 102.3125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.40143203603187433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.006808467835071497, + "learning_rate": 1.05e-06, + "loss": 0.0003, + "num_tokens": 80857562.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 114.15625, + "completions/mean_terminated_length": 114.15625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.4015475228086384, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.008016580162802711, + "learning_rate": 1.048e-06, + "loss": 0.0003, + "num_tokens": 80885215.0, + "reward": 2.8834762573242188, + "reward_std": 0.023695101961493492, + "rewards/reward_fn/mean": 2.8834762573242188, + "rewards/reward_fn/std": 0.023695088922977448, + "step": 3477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 366.40625, + "completions/mean_terminated_length": 366.40625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.40166300958540246, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.009028114844113588, + "learning_rate": 1.046e-06, + "loss": 0.0004, + "num_tokens": 80906764.0, + "reward": 3.926952838897705, + "reward_std": 0.41321703791618347, + "rewards/reward_fn/mean": 3.926952838897705, + "rewards/reward_fn/std": 0.41321706771850586, + "step": 3478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1300.0, + "completions/max_terminated_length": 1300.0, + "completions/mean_length": 393.1875, + "completions/mean_terminated_length": 393.1875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.40177849636216656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.009366089056129567, + "learning_rate": 1.044e-06, + "loss": 0.0004, + "num_tokens": 80942706.0, + "reward": 2.9100236892700195, + "reward_std": 0.6423819661140442, + "rewards/reward_fn/mean": 2.9100236892700195, + "rewards/reward_fn/std": 0.6423819065093994, + "step": 3479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 130.25, + "completions/mean_terminated_length": 130.25, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.4018939831389306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0458984375, + "kl": 0.005892742934520356, + "learning_rate": 1.042e-06, + "loss": 0.0002, + "num_tokens": 80959258.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 372.28125, + "completions/mean_terminated_length": 372.28125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.40200946991569464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04052734375, + "kl": 0.009152782353339717, + "learning_rate": 1.04e-06, + "loss": 0.0004, + "num_tokens": 80987299.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 81.5, + "completions/mean_terminated_length": 81.5, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.40212495669245873, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.003998860782303382, + "learning_rate": 1.038e-06, + "loss": 0.0002, + "num_tokens": 81010387.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 277.625, + "completions/mean_terminated_length": 277.625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.4022404434692228, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.008176561808795668, + "learning_rate": 1.036e-06, + "loss": 0.0003, + "num_tokens": 81038983.0, + "reward": 2.7660207748413086, + "reward_std": 0.03301587700843811, + "rewards/reward_fn/mean": 2.7660207748413086, + "rewards/reward_fn/std": 0.03301585465669632, + "step": 3483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 238.78125, + "completions/mean_terminated_length": 238.78125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.4023559302459868, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.007357733273238409, + "learning_rate": 1.034e-06, + "loss": 0.0003, + "num_tokens": 81061728.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 198.03125, + "completions/mean_terminated_length": 198.03125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.4024714170227509, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.014712274074554443, + "learning_rate": 1.032e-06, + "loss": 0.0006, + "num_tokens": 81089505.0, + "reward": 3.6072592735290527, + "reward_std": 0.4785803556442261, + "rewards/reward_fn/mean": 3.6072592735290527, + "rewards/reward_fn/std": 0.4785803258419037, + "step": 3485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 285.5, + "completions/mean_terminated_length": 285.5, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.40258690379951495, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.009220389394613449, + "learning_rate": 1.0299999999999999e-06, + "loss": 0.0004, + "num_tokens": 81117745.0, + "reward": 2.7746455669403076, + "reward_std": 0.19210900366306305, + "rewards/reward_fn/mean": 2.7746455669403076, + "rewards/reward_fn/std": 0.19210900366306305, + "step": 3486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 203.8125, + "completions/mean_terminated_length": 203.8125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.402702390576279, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.014000469920574687, + "learning_rate": 1.028e-06, + "loss": 0.0006, + "num_tokens": 81139211.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 166.0625, + "completions/mean_terminated_length": 166.0625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.4028178773530431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.011443889896327164, + "learning_rate": 1.026e-06, + "loss": 0.0005, + "num_tokens": 81153741.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 109.71875, + "completions/mean_terminated_length": 109.71875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.4029333641298071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1142578125, + "kl": 0.011371338630851824, + "learning_rate": 1.024e-06, + "loss": 0.0005, + "num_tokens": 81180836.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 76.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 52.90625, + "completions/mean_terminated_length": 52.90625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.4030488509065712, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.921875, + "kl": 0.008987810870166868, + "learning_rate": 1.022e-06, + "loss": 0.0004, + "num_tokens": 81194209.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 273.90625, + "completions/mean_terminated_length": 273.90625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.40316433768333526, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.010601697154925205, + "learning_rate": 1.02e-06, + "loss": 0.0004, + "num_tokens": 81227294.0, + "reward": 3.857217788696289, + "reward_std": 0.5618411898612976, + "rewards/reward_fn/mean": 3.857217788696289, + "rewards/reward_fn/std": 0.5618411898612976, + "step": 3491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 75.1875, + "completions/mean_terminated_length": 75.1875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.4032798244600993, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.625, + "kl": 0.016803236801933963, + "learning_rate": 1.018e-06, + "loss": 0.0007, + "num_tokens": 81243396.0, + "reward": 3.913665533065796, + "reward_std": 0.20216771960258484, + "rewards/reward_fn/mean": 3.913665533065796, + "rewards/reward_fn/std": 0.20216773450374603, + "step": 3492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 77.84375, + "completions/mean_terminated_length": 77.84375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.4033953112368634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.00496440540337062, + "learning_rate": 1.0159999999999999e-06, + "loss": 0.0002, + "num_tokens": 81267871.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 354.125, + "completions/mean_terminated_length": 354.125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.40351079801362744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040283203125, + "kl": 0.009007736603962258, + "learning_rate": 1.014e-06, + "loss": 0.0004, + "num_tokens": 81295267.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 147.125, + "completions/mean_terminated_length": 147.125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.4036262847903915, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.011341154233377893, + "learning_rate": 1.012e-06, + "loss": 0.0005, + "num_tokens": 81323079.0, + "reward": 3.721611499786377, + "reward_std": 0.3274502754211426, + "rewards/reward_fn/mean": 3.721611499786377, + "rewards/reward_fn/std": 0.32745030522346497, + "step": 3495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 108.3125, + "completions/mean_terminated_length": 108.3125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.40374177156715557, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.011280601516773459, + "learning_rate": 1.0099999999999999e-06, + "loss": 0.0004, + "num_tokens": 81339793.0, + "reward": 3.306112766265869, + "reward_std": 0.05988665297627449, + "rewards/reward_fn/mean": 3.306112766265869, + "rewards/reward_fn/std": 0.05988666042685509, + "step": 3496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 184.625, + "completions/mean_terminated_length": 184.625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.4038572583439196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150390625, + "kl": 0.006613661280425731, + "learning_rate": 1.008e-06, + "loss": 0.0003, + "num_tokens": 81359845.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 203.5, + "completions/mean_terminated_length": 203.5, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.4039727451206837, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.0075788339381688274, + "learning_rate": 1.006e-06, + "loss": 0.0003, + "num_tokens": 81393109.0, + "reward": 3.7923130989074707, + "reward_std": 0.6560828685760498, + "rewards/reward_fn/mean": 3.7923130989074707, + "rewards/reward_fn/std": 0.6560828685760498, + "step": 3498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 71.0, + "completions/max_terminated_length": 71.0, + "completions/mean_length": 56.9375, + "completions/mean_terminated_length": 56.9375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.40408823189744775, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.014466564411122818, + "learning_rate": 1.004e-06, + "loss": 0.0006, + "num_tokens": 81417843.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 80.28125, + "completions/mean_terminated_length": 80.28125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.4042037186742118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.013706429432204459, + "learning_rate": 1.0019999999999999e-06, + "loss": 0.0005, + "num_tokens": 81433180.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 209.25, + "completions/mean_terminated_length": 209.25, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.4043192054509759, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.008698127072420903, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 81464004.0, + "reward": 3.682882785797119, + "reward_std": 0.4462338089942932, + "rewards/reward_fn/mean": 3.682882785797119, + "rewards/reward_fn/std": 0.4462337791919708, + "step": 3501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 186.65625, + "completions/mean_terminated_length": 186.65625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.4044346922277399, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "kl": 0.010009929799707606, + "learning_rate": 9.98e-07, + "loss": 0.0004, + "num_tokens": 81494041.0, + "reward": 3.330502986907959, + "reward_std": 0.49292781949043274, + "rewards/reward_fn/mean": 3.330502986907959, + "rewards/reward_fn/std": 0.49292778968811035, + "step": 3502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 109.0, + "completions/mean_terminated_length": 109.0, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.40455017900450396, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.006669707152468618, + "learning_rate": 9.959999999999999e-07, + "loss": 0.0003, + "num_tokens": 81521273.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 226.625, + "completions/mean_terminated_length": 226.625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.40466566578126806, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.014635056009865366, + "learning_rate": 9.94e-07, + "loss": 0.0006, + "num_tokens": 81542829.0, + "reward": 3.8103442192077637, + "reward_std": 0.40439069271087646, + "rewards/reward_fn/mean": 3.8103442192077637, + "rewards/reward_fn/std": 0.40439069271087646, + "step": 3504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 407.625, + "completions/mean_terminated_length": 407.625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.4047811525580321, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.01075571462570224, + "learning_rate": 9.92e-07, + "loss": 0.0004, + "num_tokens": 81580737.0, + "reward": 3.3980095386505127, + "reward_std": 0.7664466500282288, + "rewards/reward_fn/mean": 3.3980095386505127, + "rewards/reward_fn/std": 0.766446590423584, + "step": 3505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 92.8125, + "completions/mean_terminated_length": 92.8125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.4048966393347962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.006236844266823027, + "learning_rate": 9.9e-07, + "loss": 0.0002, + "num_tokens": 81599931.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 193.3125, + "completions/mean_terminated_length": 193.3125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.40501212611156023, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.013589031485025771, + "learning_rate": 9.88e-07, + "loss": 0.0005, + "num_tokens": 81621861.0, + "reward": 2.844748020172119, + "reward_std": 0.0978904515504837, + "rewards/reward_fn/mean": 2.844748020172119, + "rewards/reward_fn/std": 0.09789042919874191, + "step": 3507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 161.9375, + "completions/mean_terminated_length": 161.9375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.4051276128883243, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.012027732911519706, + "learning_rate": 9.86e-07, + "loss": 0.0005, + "num_tokens": 81637795.0, + "reward": 3.930948257446289, + "reward_std": 0.3906151056289673, + "rewards/reward_fn/mean": 3.930948257446289, + "rewards/reward_fn/std": 0.3906151354312897, + "step": 3508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 101.65625, + "completions/mean_terminated_length": 101.65625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.40524309966508837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.006924982204509433, + "learning_rate": 9.84e-07, + "loss": 0.0003, + "num_tokens": 81664440.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 59.40625, + "completions/mean_terminated_length": 59.40625, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.4053585864418524, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.0035834749487548834, + "learning_rate": 9.819999999999999e-07, + "loss": 0.0001, + "num_tokens": 81678117.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 266.46875, + "completions/mean_terminated_length": 266.46875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.40547407321861645, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.015249535033944994, + "learning_rate": 9.8e-07, + "loss": 0.0006, + "num_tokens": 81710932.0, + "reward": 3.9216532707214355, + "reward_std": 0.24906575679779053, + "rewards/reward_fn/mean": 3.9216532707214355, + "rewards/reward_fn/std": 0.24906574189662933, + "step": 3511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 167.59375, + "completions/mean_terminated_length": 167.59375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.40558955999538054, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.008597469808591995, + "learning_rate": 9.78e-07, + "loss": 0.0003, + "num_tokens": 81729095.0, + "reward": 3.12038254737854, + "reward_std": 0.4138098359107971, + "rewards/reward_fn/mean": 3.12038254737854, + "rewards/reward_fn/std": 0.4138098657131195, + "step": 3512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 112.21875, + "completions/mean_terminated_length": 112.21875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.4057050467721446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.006421022670110688, + "learning_rate": 9.759999999999998e-07, + "loss": 0.0003, + "num_tokens": 81746414.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 225.40625, + "completions/mean_terminated_length": 225.40625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.4058205335489086, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.007473494355508592, + "learning_rate": 9.74e-07, + "loss": 0.0003, + "num_tokens": 81773307.0, + "reward": 2.7502541542053223, + "reward_std": 0.039948105812072754, + "rewards/reward_fn/mean": 2.7502541542053223, + "rewards/reward_fn/std": 0.039948124438524246, + "step": 3514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 253.5, + "completions/mean_terminated_length": 253.5, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.4059360203256727, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.01088474181597121, + "learning_rate": 9.72e-07, + "loss": 0.0004, + "num_tokens": 81803915.0, + "reward": 3.7755842208862305, + "reward_std": 0.5319482684135437, + "rewards/reward_fn/mean": 3.7755842208862305, + "rewards/reward_fn/std": 0.5319482684135437, + "step": 3515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 395.21875, + "completions/mean_terminated_length": 395.21875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.40605150710243676, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.010457875716383569, + "learning_rate": 9.7e-07, + "loss": 0.0004, + "num_tokens": 81844530.0, + "reward": 3.1087074279785156, + "reward_std": 0.8228812217712402, + "rewards/reward_fn/mean": 3.1087074279785156, + "rewards/reward_fn/std": 0.8228812217712402, + "step": 3516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 169.65625, + "completions/mean_terminated_length": 169.65625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.40616699387920085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04638671875, + "kl": 0.008115304153761826, + "learning_rate": 9.679999999999999e-07, + "loss": 0.0003, + "num_tokens": 81863111.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 123.15625, + "completions/mean_terminated_length": 123.15625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.4062824806559649, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.009814305642066756, + "learning_rate": 9.66e-07, + "loss": 0.0004, + "num_tokens": 81883084.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 205.6875, + "completions/mean_terminated_length": 205.6875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.40639796743272893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.00844603635778185, + "learning_rate": 9.64e-07, + "loss": 0.0003, + "num_tokens": 81897634.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 139.03125, + "completions/mean_terminated_length": 139.03125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.40651345420949303, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054443359375, + "kl": 0.0073275249160360545, + "learning_rate": 9.619999999999999e-07, + "loss": 0.0003, + "num_tokens": 81918019.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 293.8125, + "completions/mean_terminated_length": 293.8125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.40662894098625707, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.008898912943550386, + "learning_rate": 9.6e-07, + "loss": 0.0004, + "num_tokens": 81935581.0, + "reward": 3.9014487266540527, + "reward_std": 0.4221985638141632, + "rewards/reward_fn/mean": 3.9014487266540527, + "rewards/reward_fn/std": 0.4221985638141632, + "step": 3521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 232.0625, + "completions/mean_terminated_length": 232.0625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.4067444277630211, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.013016983211855404, + "learning_rate": 9.58e-07, + "loss": 0.0005, + "num_tokens": 81956799.0, + "reward": 3.925616979598999, + "reward_std": 0.29273688793182373, + "rewards/reward_fn/mean": 3.925616979598999, + "rewards/reward_fn/std": 0.29273688793182373, + "step": 3522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 118.625, + "completions/mean_terminated_length": 118.625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.4068599145397852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.011286576060228981, + "learning_rate": 9.559999999999998e-07, + "loss": 0.0005, + "num_tokens": 81981203.0, + "reward": 3.881507158279419, + "reward_std": 0.22822131216526031, + "rewards/reward_fn/mean": 3.881507158279419, + "rewards/reward_fn/std": 0.2282213717699051, + "step": 3523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 99.21875, + "completions/mean_terminated_length": 99.21875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.40697540131654925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.01442226703511551, + "learning_rate": 9.539999999999999e-07, + "loss": 0.0006, + "num_tokens": 82007962.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 327.1875, + "completions/mean_terminated_length": 327.1875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.40709088809331334, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.013910518784541637, + "learning_rate": 9.52e-07, + "loss": 0.0006, + "num_tokens": 82031392.0, + "reward": 2.941704273223877, + "reward_std": 0.3659498393535614, + "rewards/reward_fn/mean": 2.941704273223877, + "rewards/reward_fn/std": 0.365949809551239, + "step": 3525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 80.78125, + "completions/mean_terminated_length": 80.78125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.4072063748700774, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.171875, + "kl": 0.031279669296054635, + "learning_rate": 9.499999999999999e-07, + "loss": 0.0013, + "num_tokens": 82050937.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 118.4375, + "completions/mean_terminated_length": 118.4375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.4073218616468414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.010957897669868544, + "learning_rate": 9.479999999999999e-07, + "loss": 0.0004, + "num_tokens": 82072583.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1231.0, + "completions/max_terminated_length": 1231.0, + "completions/mean_length": 411.28125, + "completions/mean_terminated_length": 411.28125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.4074373484236055, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.01135617881664075, + "learning_rate": 9.459999999999999e-07, + "loss": 0.0005, + "num_tokens": 82107824.0, + "reward": 2.9164676666259766, + "reward_std": 0.7547491192817688, + "rewards/reward_fn/mean": 2.9164676666259766, + "rewards/reward_fn/std": 0.7547491192817688, + "step": 3528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 291.40625, + "completions/mean_terminated_length": 291.40625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.40755283520036956, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.010084021101647522, + "learning_rate": 9.439999999999999e-07, + "loss": 0.0004, + "num_tokens": 82141405.0, + "reward": 3.977590799331665, + "reward_std": 0.1267659068107605, + "rewards/reward_fn/mean": 3.977590799331665, + "rewards/reward_fn/std": 0.1267659068107605, + "step": 3529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 266.0625, + "completions/mean_terminated_length": 266.0625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.4076683219771336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0439453125, + "kl": 0.0054612177482340485, + "learning_rate": 9.419999999999999e-07, + "loss": 0.0002, + "num_tokens": 82165151.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 211.90625, + "completions/mean_terminated_length": 211.90625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.4077838087538977, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.009113801199418958, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0004, + "num_tokens": 82195164.0, + "reward": 3.157322645187378, + "reward_std": 0.4231955111026764, + "rewards/reward_fn/mean": 3.157322645187378, + "rewards/reward_fn/std": 0.4231955409049988, + "step": 3531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 154.75, + "completions/mean_terminated_length": 154.75, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.40789929553066173, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.0073474012679071166, + "learning_rate": 9.379999999999998e-07, + "loss": 0.0003, + "num_tokens": 82213972.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 107.5625, + "completions/mean_terminated_length": 107.5625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.4080147823074258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041259765625, + "kl": 0.0029353017771427403, + "learning_rate": 9.36e-07, + "loss": 0.0001, + "num_tokens": 82240614.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 130.96875, + "completions/mean_terminated_length": 130.96875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.40813026908418987, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.007675628519791644, + "learning_rate": 9.34e-07, + "loss": 0.0003, + "num_tokens": 82264805.0, + "reward": 2.9063687324523926, + "reward_std": 0.05519195646047592, + "rewards/reward_fn/mean": 2.9063687324523926, + "rewards/reward_fn/std": 0.05519195646047592, + "step": 3534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 124.1875, + "completions/mean_terminated_length": 124.1875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.4082457558609539, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1708984375, + "kl": 0.017459375732869375, + "learning_rate": 9.32e-07, + "loss": 0.0007, + "num_tokens": 82279467.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 89.125, + "completions/mean_terminated_length": 89.125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.408361242637718, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.004348043961726944, + "learning_rate": 9.3e-07, + "loss": 0.0002, + "num_tokens": 82301039.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 76.84375, + "completions/mean_terminated_length": 76.84375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.40847672941448204, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.828125, + "kl": 0.01743472811722313, + "learning_rate": 9.28e-07, + "loss": 0.0007, + "num_tokens": 82316874.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 265.15625, + "completions/mean_terminated_length": 265.15625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.4085922161912461, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.010422719191410579, + "learning_rate": 9.26e-07, + "loss": 0.0004, + "num_tokens": 82343055.0, + "reward": 2.9292640686035156, + "reward_std": 0.22257249057292938, + "rewards/reward_fn/mean": 2.9292640686035156, + "rewards/reward_fn/std": 0.2225724756717682, + "step": 3538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 109.03125, + "completions/mean_terminated_length": 109.03125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.4087077029680102, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.008219552251830464, + "learning_rate": 9.24e-07, + "loss": 0.0003, + "num_tokens": 82367120.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 120.0625, + "completions/mean_terminated_length": 120.0625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.4088231897447742, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.020856336719589308, + "learning_rate": 9.22e-07, + "loss": 0.0008, + "num_tokens": 82383954.0, + "reward": 3.9642438888549805, + "reward_std": 0.11538804322481155, + "rewards/reward_fn/mean": 3.9642438888549805, + "rewards/reward_fn/std": 0.11538804322481155, + "step": 3540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 200.15625, + "completions/mean_terminated_length": 200.15625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.40893867652153826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.016297925089020282, + "learning_rate": 9.2e-07, + "loss": 0.0007, + "num_tokens": 82407095.0, + "reward": 3.183784008026123, + "reward_std": 0.2549075186252594, + "rewards/reward_fn/mean": 3.183784008026123, + "rewards/reward_fn/std": 0.254907488822937, + "step": 3541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 210.71875, + "completions/mean_terminated_length": 210.71875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.40905416329830235, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.011118903013993986, + "learning_rate": 9.18e-07, + "loss": 0.0004, + "num_tokens": 82427950.0, + "reward": 3.7882652282714844, + "reward_std": 0.6689299941062927, + "rewards/reward_fn/mean": 3.7882652282714844, + "rewards/reward_fn/std": 0.6689299941062927, + "step": 3542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 150.65625, + "completions/mean_terminated_length": 150.65625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.4091696500750664, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.96875, + "kl": 0.01347092486685142, + "learning_rate": 9.16e-07, + "loss": 0.0005, + "num_tokens": 82445123.0, + "reward": 3.676971912384033, + "reward_std": 0.48782673478126526, + "rewards/reward_fn/mean": 3.676971912384033, + "rewards/reward_fn/std": 0.48782673478126526, + "step": 3543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 119.84375, + "completions/mean_terminated_length": 119.84375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.4092851368518305, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.017924428757396527, + "learning_rate": 9.14e-07, + "loss": 0.0007, + "num_tokens": 82479134.0, + "reward": 3.524103879928589, + "reward_std": 0.12307426333427429, + "rewards/reward_fn/mean": 3.524103879928589, + "rewards/reward_fn/std": 0.1230742558836937, + "step": 3544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 196.40625, + "completions/mean_terminated_length": 196.40625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.40940062362859453, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.02000596816651523, + "learning_rate": 9.12e-07, + "loss": 0.0008, + "num_tokens": 82506347.0, + "reward": 3.801792621612549, + "reward_std": 0.32584235072135925, + "rewards/reward_fn/mean": 3.801792621612549, + "rewards/reward_fn/std": 0.32584232091903687, + "step": 3545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 345.40625, + "completions/mean_terminated_length": 345.40625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.40951611040535857, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.007443603761203121, + "learning_rate": 9.1e-07, + "loss": 0.0003, + "num_tokens": 82542104.0, + "reward": 2.6662306785583496, + "reward_std": 0.25813060998916626, + "rewards/reward_fn/mean": 2.6662306785583496, + "rewards/reward_fn/std": 0.25813063979148865, + "step": 3546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 319.90625, + "completions/mean_terminated_length": 319.90625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.40963159718212266, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.006589930424524937, + "learning_rate": 9.08e-07, + "loss": 0.0003, + "num_tokens": 82565013.0, + "reward": 3.2486765384674072, + "reward_std": 0.4046629071235657, + "rewards/reward_fn/mean": 3.2486765384674072, + "rewards/reward_fn/std": 0.4046628773212433, + "step": 3547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 227.65625, + "completions/mean_terminated_length": 227.65625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.4097470839588867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042724609375, + "kl": 0.006829101726907538, + "learning_rate": 9.06e-07, + "loss": 0.0003, + "num_tokens": 82586634.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1370.0, + "completions/max_terminated_length": 1370.0, + "completions/mean_length": 675.90625, + "completions/mean_terminated_length": 675.90625, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.40986257073565074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.0076429428736446425, + "learning_rate": 9.039999999999999e-07, + "loss": 0.0003, + "num_tokens": 82620807.0, + "reward": 3.634036064147949, + "reward_std": 0.8641049265861511, + "rewards/reward_fn/mean": 3.634036064147949, + "rewards/reward_fn/std": 0.8641048073768616, + "step": 3549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 215.0, + "completions/mean_terminated_length": 215.0, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.40997805751241484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0439453125, + "kl": 0.007226528483442962, + "learning_rate": 9.02e-07, + "loss": 0.0003, + "num_tokens": 82641927.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 159.75, + "completions/mean_terminated_length": 159.75, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.4100935442891789, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.009334985275927465, + "learning_rate": 9e-07, + "loss": 0.0004, + "num_tokens": 82662047.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 239.03125, + "completions/mean_terminated_length": 239.03125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.410209031065943, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.008921982072934043, + "learning_rate": 8.98e-07, + "loss": 0.0004, + "num_tokens": 82684800.0, + "reward": 3.9713873863220215, + "reward_std": 0.1618572324514389, + "rewards/reward_fn/mean": 3.9713873863220215, + "rewards/reward_fn/std": 0.1618572175502777, + "step": 3552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 162.96875, + "completions/mean_terminated_length": 162.96875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.410324517842707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30859375, + "kl": 0.027630938464426436, + "learning_rate": 8.96e-07, + "loss": 0.0011, + "num_tokens": 82709375.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 221.375, + "completions/mean_terminated_length": 221.375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.41044000461947105, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.008404529886320233, + "learning_rate": 8.939999999999999e-07, + "loss": 0.0003, + "num_tokens": 82724235.0, + "reward": 3.977022647857666, + "reward_std": 0.1299787312746048, + "rewards/reward_fn/mean": 3.977022647857666, + "rewards/reward_fn/std": 0.1299787312746048, + "step": 3554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 170.625, + "completions/mean_terminated_length": 170.625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.41055549139623515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.009749490825925022, + "learning_rate": 8.92e-07, + "loss": 0.0004, + "num_tokens": 82741983.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 245.9375, + "completions/mean_terminated_length": 245.9375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.4106709781729992, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.007694671527133323, + "learning_rate": 8.9e-07, + "loss": 0.0003, + "num_tokens": 82772989.0, + "reward": 3.8568928241729736, + "reward_std": 0.5239648222923279, + "rewards/reward_fn/mean": 3.8568928241729736, + "rewards/reward_fn/std": 0.5239647626876831, + "step": 3556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 95.46875, + "completions/mean_terminated_length": 95.46875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.41078646494976323, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.006706615626171697, + "learning_rate": 8.88e-07, + "loss": 0.0003, + "num_tokens": 82786636.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 59.53125, + "completions/mean_terminated_length": 59.53125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.4109019517265273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15625, + "kl": 0.011266499732300872, + "learning_rate": 8.86e-07, + "loss": 0.0005, + "num_tokens": 82804189.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 128.5625, + "completions/mean_terminated_length": 128.5625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.41101743850329137, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "kl": 0.008813279615424108, + "learning_rate": 8.839999999999999e-07, + "loss": 0.0004, + "num_tokens": 82824687.0, + "reward": 2.9729185104370117, + "reward_std": 0.06710214912891388, + "rewards/reward_fn/mean": 2.9729185104370117, + "rewards/reward_fn/std": 0.06710217148065567, + "step": 3559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 234.25, + "completions/mean_terminated_length": 234.25, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.41113292528005546, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.011042733625799883, + "learning_rate": 8.82e-07, + "loss": 0.0004, + "num_tokens": 82858967.0, + "reward": 2.869403839111328, + "reward_std": 0.3755526542663574, + "rewards/reward_fn/mean": 2.869403839111328, + "rewards/reward_fn/std": 0.3755527138710022, + "step": 3560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 177.96875, + "completions/mean_terminated_length": 177.96875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.4112484120568195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.00835727413505083, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0003, + "num_tokens": 82879734.0, + "reward": 3.9389853477478027, + "reward_std": 0.24009397625923157, + "rewards/reward_fn/mean": 3.9389853477478027, + "rewards/reward_fn/std": 0.24009396135807037, + "step": 3561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 121.3125, + "completions/mean_terminated_length": 121.3125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.41136389883358354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.008549743623007089, + "learning_rate": 8.78e-07, + "loss": 0.0003, + "num_tokens": 82902048.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 71.4375, + "completions/mean_terminated_length": 71.4375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.41147938561034764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.0041998173746833345, + "learning_rate": 8.76e-07, + "loss": 0.0002, + "num_tokens": 82924430.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 204.78125, + "completions/mean_terminated_length": 204.78125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.4115948723871117, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.011413693617214449, + "learning_rate": 8.739999999999999e-07, + "loss": 0.0005, + "num_tokens": 82949031.0, + "reward": 3.977821111679077, + "reward_std": 0.1254623830318451, + "rewards/reward_fn/mean": 3.977821111679077, + "rewards/reward_fn/std": 0.1254623532295227, + "step": 3564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 179.5, + "completions/mean_terminated_length": 179.5, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.4117103591638757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051513671875, + "kl": 0.007243617430503946, + "learning_rate": 8.72e-07, + "loss": 0.0003, + "num_tokens": 82970103.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 220.625, + "completions/mean_terminated_length": 220.625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.4118258459406398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.011081719829235226, + "learning_rate": 8.699999999999999e-07, + "loss": 0.0004, + "num_tokens": 82989323.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 224.25, + "completions/mean_terminated_length": 224.25, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.41194133271740385, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.0075252479073242284, + "learning_rate": 8.68e-07, + "loss": 0.0003, + "num_tokens": 83019475.0, + "reward": 2.9999048709869385, + "reward_std": 0.22745957970619202, + "rewards/reward_fn/mean": 2.9999048709869385, + "rewards/reward_fn/std": 0.2274596095085144, + "step": 3567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 377.71875, + "completions/mean_terminated_length": 377.71875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.4120568194941679, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.013715397639316507, + "learning_rate": 8.659999999999999e-07, + "loss": 0.0005, + "num_tokens": 83043594.0, + "reward": 3.7247161865234375, + "reward_std": 0.7399970293045044, + "rewards/reward_fn/mean": 3.7247161865234375, + "rewards/reward_fn/std": 0.7399970889091492, + "step": 3568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 293.8125, + "completions/mean_terminated_length": 293.8125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.412172306270932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.010762714300653897, + "learning_rate": 8.639999999999999e-07, + "loss": 0.0004, + "num_tokens": 83067652.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 122.625, + "completions/mean_terminated_length": 122.625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.412287793047696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.01186864146438893, + "learning_rate": 8.62e-07, + "loss": 0.0005, + "num_tokens": 83099992.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 165.125, + "completions/mean_terminated_length": 165.125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.4124032798244601, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.007025521699688397, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0003, + "num_tokens": 83126876.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 248.90625, + "completions/mean_terminated_length": 248.90625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.41251876660122416, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.009129902507993393, + "learning_rate": 8.58e-07, + "loss": 0.0004, + "num_tokens": 83153209.0, + "reward": 3.836095094680786, + "reward_std": 0.49457216262817383, + "rewards/reward_fn/mean": 3.836095094680786, + "rewards/reward_fn/std": 0.49457216262817383, + "step": 3572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 208.625, + "completions/mean_terminated_length": 208.625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.4126342533779882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.010183456164668314, + "learning_rate": 8.559999999999999e-07, + "loss": 0.0004, + "num_tokens": 83179117.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 118.125, + "completions/mean_terminated_length": 118.125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.4127497401547523, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.011475124119897373, + "learning_rate": 8.539999999999999e-07, + "loss": 0.0005, + "num_tokens": 83198865.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 263.71875, + "completions/mean_terminated_length": 263.71875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.41286522693151634, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041259765625, + "kl": 0.008149833367497195, + "learning_rate": 8.52e-07, + "loss": 0.0003, + "num_tokens": 83223464.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 65.125, + "completions/mean_terminated_length": 65.125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.4129807137082804, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.005693426537618507, + "learning_rate": 8.499999999999999e-07, + "loss": 0.0002, + "num_tokens": 83251884.0, + "reward": 3.660393238067627, + "reward_std": 0.16054117679595947, + "rewards/reward_fn/mean": 3.660393238067627, + "rewards/reward_fn/std": 0.16054119169712067, + "step": 3576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 161.84375, + "completions/mean_terminated_length": 161.84375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.4130962004850445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048095703125, + "kl": 0.006401250349881593, + "learning_rate": 8.48e-07, + "loss": 0.0003, + "num_tokens": 83276679.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 105.125, + "completions/mean_terminated_length": 105.125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.4132116872618085, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "kl": 0.008378864382393658, + "learning_rate": 8.459999999999999e-07, + "loss": 0.0003, + "num_tokens": 83297771.0, + "reward": 3.7916982173919678, + "reward_std": 0.658028781414032, + "rewards/reward_fn/mean": 3.7916982173919678, + "rewards/reward_fn/std": 0.658028781414032, + "step": 3578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 212.34375, + "completions/mean_terminated_length": 212.34375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.4133271740385726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048583984375, + "kl": 0.007215318335511256, + "learning_rate": 8.439999999999999e-07, + "loss": 0.0003, + "num_tokens": 83312822.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 193.09375, + "completions/mean_terminated_length": 193.09375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.41344266081533665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046630859375, + "kl": 0.008132506834954256, + "learning_rate": 8.419999999999999e-07, + "loss": 0.0003, + "num_tokens": 83342393.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1051.0, + "completions/max_terminated_length": 1051.0, + "completions/mean_length": 436.4375, + "completions/mean_terminated_length": 436.4375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.4135581475921007, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.007497093283745926, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0003, + "num_tokens": 83371527.0, + "reward": 2.745147705078125, + "reward_std": 0.2176670879125595, + "rewards/reward_fn/mean": 2.745147705078125, + "rewards/reward_fn/std": 0.21766704320907593, + "step": 3581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 142.25, + "completions/mean_terminated_length": 142.25, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.4136736343688648, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.011734200168575626, + "learning_rate": 8.38e-07, + "loss": 0.0005, + "num_tokens": 83398063.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 252.84375, + "completions/mean_terminated_length": 252.84375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.4137891211456288, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0380859375, + "kl": 0.006063758752134163, + "learning_rate": 8.359999999999999e-07, + "loss": 0.0002, + "num_tokens": 83430826.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 139.4375, + "completions/mean_terminated_length": 139.4375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.41390460792239286, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.008815014545689337, + "learning_rate": 8.34e-07, + "loss": 0.0004, + "num_tokens": 83445080.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 297.46875, + "completions/mean_terminated_length": 297.46875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.41402009469915696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035400390625, + "kl": 0.009200901869917288, + "learning_rate": 8.319999999999999e-07, + "loss": 0.0004, + "num_tokens": 83469575.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 85.34375, + "completions/mean_terminated_length": 85.34375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.414135581475921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.005416645712102763, + "learning_rate": 8.299999999999999e-07, + "loss": 0.0002, + "num_tokens": 83494034.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 103.1875, + "completions/mean_terminated_length": 103.1875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.4142510682526851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.013415421650279313, + "learning_rate": 8.28e-07, + "loss": 0.0005, + "num_tokens": 83518680.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 110.40625, + "completions/mean_terminated_length": 110.40625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.41436655502944914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.008760202283156104, + "learning_rate": 8.259999999999999e-07, + "loss": 0.0004, + "num_tokens": 83546277.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 86.0, + "completions/max_terminated_length": 86.0, + "completions/mean_length": 54.9375, + "completions/mean_terminated_length": 54.9375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.4144820418062132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.0058676279077189974, + "learning_rate": 8.24e-07, + "loss": 0.0002, + "num_tokens": 83566371.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 139.15625, + "completions/mean_terminated_length": 139.15625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.41459752858297727, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.009781085580470972, + "learning_rate": 8.219999999999999e-07, + "loss": 0.0004, + "num_tokens": 83586792.0, + "reward": 3.264495372772217, + "reward_std": 0.33476880192756653, + "rewards/reward_fn/mean": 3.264495372772217, + "rewards/reward_fn/std": 0.33476880192756653, + "step": 3590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 93.03125, + "completions/mean_terminated_length": 93.03125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.4147130153597413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04345703125, + "kl": 0.0038959724188316613, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0002, + "num_tokens": 83599465.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 258.84375, + "completions/mean_terminated_length": 258.84375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.41482850213650535, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.011803634697571397, + "learning_rate": 8.179999999999999e-07, + "loss": 0.0005, + "num_tokens": 83618980.0, + "reward": 3.4260411262512207, + "reward_std": 0.3138890266418457, + "rewards/reward_fn/mean": 3.4260411262512207, + "rewards/reward_fn/std": 0.3138890564441681, + "step": 3592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 257.1875, + "completions/mean_terminated_length": 257.1875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.41494398891326945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0400390625, + "kl": 0.006914149056683527, + "learning_rate": 8.159999999999999e-07, + "loss": 0.0003, + "num_tokens": 83641994.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 148.96875, + "completions/mean_terminated_length": 148.96875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.4150594756900335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.00870513803238282, + "learning_rate": 8.14e-07, + "loss": 0.0003, + "num_tokens": 83663113.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 141.5625, + "completions/mean_terminated_length": 141.5625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.4151749624667975, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.024877507355995476, + "learning_rate": 8.12e-07, + "loss": 0.001, + "num_tokens": 83677019.0, + "reward": 3.15458345413208, + "reward_std": 0.32512062788009644, + "rewards/reward_fn/mean": 3.15458345413208, + "rewards/reward_fn/std": 0.3251206874847412, + "step": 3595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 329.15625, + "completions/mean_terminated_length": 329.15625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.4152904492435616, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.010153786948649213, + "learning_rate": 8.1e-07, + "loss": 0.0004, + "num_tokens": 83710688.0, + "reward": 3.8935375213623047, + "reward_std": 0.33717888593673706, + "rewards/reward_fn/mean": 3.8935375213623047, + "rewards/reward_fn/std": 0.33717894554138184, + "step": 3596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 91.375, + "completions/mean_terminated_length": 91.375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.41540593602032566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.004834311042941408, + "learning_rate": 8.08e-07, + "loss": 0.0002, + "num_tokens": 83735404.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 129.625, + "completions/mean_terminated_length": 129.625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.41552142279708976, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.018607558304211125, + "learning_rate": 8.06e-07, + "loss": 0.0007, + "num_tokens": 83761600.0, + "reward": 3.850620985031128, + "reward_std": 0.4945065975189209, + "rewards/reward_fn/mean": 3.850620985031128, + "rewards/reward_fn/std": 0.4945066273212433, + "step": 3598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 83.34375, + "completions/mean_terminated_length": 83.34375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.4156369095738538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.007808444897818845, + "learning_rate": 8.04e-07, + "loss": 0.0003, + "num_tokens": 83782283.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 135.03125, + "completions/mean_terminated_length": 135.03125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.41575239635061784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1123046875, + "kl": 0.01671973682823591, + "learning_rate": 8.02e-07, + "loss": 0.0007, + "num_tokens": 83799820.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 119.875, + "completions/mean_terminated_length": 119.875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.41586788312738193, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061767578125, + "kl": 0.006954975899134297, + "learning_rate": 8e-07, + "loss": 0.0003, + "num_tokens": 83830184.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 85.34375, + "completions/mean_terminated_length": 85.34375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.41598336990414597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03857421875, + "kl": 0.003794708295572491, + "learning_rate": 7.98e-07, + "loss": 0.0002, + "num_tokens": 83845043.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 136.4375, + "completions/mean_terminated_length": 136.4375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.41609885668091, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.01196233747759834, + "learning_rate": 7.96e-07, + "loss": 0.0005, + "num_tokens": 83864161.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 121.3125, + "completions/mean_terminated_length": 121.3125, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.4162143434576741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.009022326750709908, + "learning_rate": 7.94e-07, + "loss": 0.0004, + "num_tokens": 83892811.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 250.59375, + "completions/mean_terminated_length": 250.59375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.41632983023443815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.007632405526237562, + "learning_rate": 7.92e-07, + "loss": 0.0003, + "num_tokens": 83924862.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 216.75, + "completions/mean_terminated_length": 216.75, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.41644531701120224, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.014339761110022664, + "learning_rate": 7.9e-07, + "loss": 0.0006, + "num_tokens": 83953302.0, + "reward": 3.877972364425659, + "reward_std": 0.29179006814956665, + "rewards/reward_fn/mean": 3.877972364425659, + "rewards/reward_fn/std": 0.29179006814956665, + "step": 3606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 207.0625, + "completions/mean_terminated_length": 207.0625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.4165608037879663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.00760897954023676, + "learning_rate": 7.88e-07, + "loss": 0.0003, + "num_tokens": 83982456.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 58.875, + "completions/mean_terminated_length": 58.875, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.4166762905647303, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.09375, + "kl": 0.02839779779969831, + "learning_rate": 7.86e-07, + "loss": 0.0011, + "num_tokens": 83994388.0, + "reward": 3.75, + "reward_std": 0.9837387204170227, + "rewards/reward_fn/mean": 3.75, + "rewards/reward_fn/std": 0.9837387204170227, + "step": 3608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1690.0, + "completions/max_terminated_length": 1690.0, + "completions/mean_length": 405.90625, + "completions/mean_terminated_length": 405.90625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.4167917773414944, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.013095621077809483, + "learning_rate": 7.84e-07, + "loss": 0.0005, + "num_tokens": 84020625.0, + "reward": 2.9007272720336914, + "reward_std": 0.5379204750061035, + "rewards/reward_fn/mean": 2.9007272720336914, + "rewards/reward_fn/std": 0.5379204154014587, + "step": 3609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 233.46875, + "completions/mean_terminated_length": 233.46875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.41690726411825846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.011209305012016557, + "learning_rate": 7.82e-07, + "loss": 0.0004, + "num_tokens": 84040576.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 129.03125, + "completions/mean_terminated_length": 129.03125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.4170227508950225, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.013307152097695507, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0005, + "num_tokens": 84073729.0, + "reward": 3.96871018409729, + "reward_std": 0.17700161039829254, + "rewards/reward_fn/mean": 3.96871018409729, + "rewards/reward_fn/std": 0.17700158059597015, + "step": 3611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 148.46875, + "completions/mean_terminated_length": 148.46875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.4171382376717866, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.005376807210268453, + "learning_rate": 7.78e-07, + "loss": 0.0002, + "num_tokens": 84090096.0, + "reward": 3.969968795776367, + "reward_std": 0.16988199949264526, + "rewards/reward_fn/mean": 3.969968795776367, + "rewards/reward_fn/std": 0.16988196969032288, + "step": 3612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 184.875, + "completions/mean_terminated_length": 184.875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.41725372444855063, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.010035388942924328, + "learning_rate": 7.76e-07, + "loss": 0.0004, + "num_tokens": 84120204.0, + "reward": 3.130436420440674, + "reward_std": 0.16480287909507751, + "rewards/reward_fn/mean": 3.130436420440674, + "rewards/reward_fn/std": 0.16480284929275513, + "step": 3613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 86.6875, + "completions/mean_terminated_length": 86.6875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.41736921122531473, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.00702034740606905, + "learning_rate": 7.74e-07, + "loss": 0.0003, + "num_tokens": 84147906.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 223.1875, + "completions/mean_terminated_length": 223.1875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.41748469800207877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048583984375, + "kl": 0.007949963328428566, + "learning_rate": 7.72e-07, + "loss": 0.0003, + "num_tokens": 84170760.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 126.84375, + "completions/mean_terminated_length": 126.84375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.4176001847788428, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.011048370288335718, + "learning_rate": 7.699999999999999e-07, + "loss": 0.0004, + "num_tokens": 84190819.0, + "reward": 3.984360694885254, + "reward_std": 0.08846892416477203, + "rewards/reward_fn/mean": 3.984360694885254, + "rewards/reward_fn/std": 0.08846888691186905, + "step": 3616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 262.25, + "completions/mean_terminated_length": 262.25, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.4177156715556069, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.0112504795397399, + "learning_rate": 7.68e-07, + "loss": 0.0005, + "num_tokens": 84216267.0, + "reward": 3.0882534980773926, + "reward_std": 0.48313215374946594, + "rewards/reward_fn/mean": 3.0882534980773926, + "rewards/reward_fn/std": 0.48313212394714355, + "step": 3617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 283.65625, + "completions/mean_terminated_length": 283.65625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.41783115833237094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.008696334043634124, + "learning_rate": 7.66e-07, + "loss": 0.0003, + "num_tokens": 84240704.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 89.1875, + "completions/mean_terminated_length": 89.1875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.417946645109135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.0116143352133804, + "learning_rate": 7.64e-07, + "loss": 0.0005, + "num_tokens": 84265798.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 208.4375, + "completions/mean_terminated_length": 208.4375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.4180621318858991, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.00863961772120092, + "learning_rate": 7.62e-07, + "loss": 0.0003, + "num_tokens": 84291700.0, + "reward": 3.944368362426758, + "reward_std": 0.22624292969703674, + "rewards/reward_fn/mean": 3.944368362426758, + "rewards/reward_fn/std": 0.22624295949935913, + "step": 3620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 193.375, + "completions/mean_terminated_length": 193.375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.4181776186626631, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039306640625, + "kl": 0.007097473098838236, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0003, + "num_tokens": 84312992.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 78.59375, + "completions/mean_terminated_length": 78.59375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.41829310543942716, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16015625, + "kl": 0.013942876314104069, + "learning_rate": 7.58e-07, + "loss": 0.0006, + "num_tokens": 84328339.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 177.96875, + "completions/mean_terminated_length": 177.96875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.41840859221619126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05712890625, + "kl": 0.010187761559791397, + "learning_rate": 7.559999999999999e-07, + "loss": 0.0004, + "num_tokens": 84358706.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1743.0, + "completions/max_terminated_length": 1743.0, + "completions/mean_length": 425.75, + "completions/mean_terminated_length": 425.75, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.4185240789929553, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.009107843216042966, + "learning_rate": 7.54e-07, + "loss": 0.0004, + "num_tokens": 84383722.0, + "reward": 3.8592758178710938, + "reward_std": 0.5537682771682739, + "rewards/reward_fn/mean": 3.8592758178710938, + "rewards/reward_fn/std": 0.5537682771682739, + "step": 3624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 144.3125, + "completions/mean_terminated_length": 144.3125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.4186395657697194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.00778976265428355, + "learning_rate": 7.52e-07, + "loss": 0.0003, + "num_tokens": 84406292.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 109.09375, + "completions/mean_terminated_length": 109.09375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.41875505254648343, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03955078125, + "kl": 0.0032738052013883134, + "learning_rate": 7.5e-07, + "loss": 0.0001, + "num_tokens": 84431223.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 73.28125, + "completions/mean_terminated_length": 73.28125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.41887053932324747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1787109375, + "kl": 0.02181335465866141, + "learning_rate": 7.48e-07, + "loss": 0.0009, + "num_tokens": 84452672.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 117.9375, + "completions/mean_terminated_length": 117.9375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.41898602610001157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.008915351034374908, + "learning_rate": 7.459999999999999e-07, + "loss": 0.0004, + "num_tokens": 84473630.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 365.0625, + "completions/mean_terminated_length": 365.0625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.4191015128767756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0341796875, + "kl": 0.007241148050525226, + "learning_rate": 7.44e-07, + "loss": 0.0003, + "num_tokens": 84495008.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 215.03125, + "completions/mean_terminated_length": 215.03125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.41921699965353965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.008783750956354197, + "learning_rate": 7.42e-07, + "loss": 0.0004, + "num_tokens": 84517825.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 194.3125, + "completions/mean_terminated_length": 194.3125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.41933248643030374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.017441865042201243, + "learning_rate": 7.4e-07, + "loss": 0.0007, + "num_tokens": 84546955.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 251.15625, + "completions/mean_terminated_length": 251.15625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.4194479732070678, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.011954636487644166, + "learning_rate": 7.38e-07, + "loss": 0.0005, + "num_tokens": 84568112.0, + "reward": 3.6489384174346924, + "reward_std": 0.8289499878883362, + "rewards/reward_fn/mean": 3.6489384174346924, + "rewards/reward_fn/std": 0.8289499878883362, + "step": 3632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1754.0, + "completions/max_terminated_length": 1754.0, + "completions/mean_length": 396.625, + "completions/mean_terminated_length": 396.625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.4195634599838319, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.014787713706027716, + "learning_rate": 7.359999999999999e-07, + "loss": 0.0006, + "num_tokens": 84602052.0, + "reward": 3.414865016937256, + "reward_std": 0.8343104124069214, + "rewards/reward_fn/mean": 3.414865016937256, + "rewards/reward_fn/std": 0.8343103528022766, + "step": 3633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 216.65625, + "completions/mean_terminated_length": 216.65625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.4196789467605959, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049072265625, + "kl": 0.009635986891225912, + "learning_rate": 7.34e-07, + "loss": 0.0004, + "num_tokens": 84631961.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 100.09375, + "completions/mean_terminated_length": 100.09375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.41979443353735996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.010435360207338817, + "learning_rate": 7.319999999999999e-07, + "loss": 0.0004, + "num_tokens": 84652572.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 183.3125, + "completions/mean_terminated_length": 183.3125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.41990992031412405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05908203125, + "kl": 0.007902831785031594, + "learning_rate": 7.3e-07, + "loss": 0.0003, + "num_tokens": 84680742.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 74.53125, + "completions/mean_terminated_length": 74.53125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.4200254070908881, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.006796184665290639, + "learning_rate": 7.28e-07, + "loss": 0.0003, + "num_tokens": 84698871.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 123.71875, + "completions/mean_terminated_length": 123.71875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.42014089386765213, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04345703125, + "kl": 0.005897036902752006, + "learning_rate": 7.259999999999999e-07, + "loss": 0.0002, + "num_tokens": 84721870.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 229.1875, + "completions/mean_terminated_length": 229.1875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.4202563806444162, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.01416583666286897, + "learning_rate": 7.24e-07, + "loss": 0.0006, + "num_tokens": 84754644.0, + "reward": 3.911339282989502, + "reward_std": 0.2436724454164505, + "rewards/reward_fn/mean": 3.911339282989502, + "rewards/reward_fn/std": 0.2436724454164505, + "step": 3639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 82.53125, + "completions/mean_terminated_length": 82.53125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.42037186742118027, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.005925016404944472, + "learning_rate": 7.219999999999999e-07, + "loss": 0.0002, + "num_tokens": 84773861.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 100.0, + "completions/max_terminated_length": 100.0, + "completions/mean_length": 67.78125, + "completions/mean_terminated_length": 67.78125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.42048735419794436, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.90625, + "kl": 0.007157815534810652, + "learning_rate": 7.2e-07, + "loss": 0.0003, + "num_tokens": 84784254.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 135.0, + "completions/mean_terminated_length": 135.0, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.4206028409747084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0390625, + "kl": 0.0043398118559707655, + "learning_rate": 7.179999999999999e-07, + "loss": 0.0002, + "num_tokens": 84801182.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 259.03125, + "completions/mean_terminated_length": 259.03125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.42071832775147244, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0341796875, + "kl": 0.005715594525099732, + "learning_rate": 7.159999999999999e-07, + "loss": 0.0002, + "num_tokens": 84825663.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 123.5625, + "completions/mean_terminated_length": 123.5625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.42083381452823654, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.009669903643953148, + "learning_rate": 7.14e-07, + "loss": 0.0004, + "num_tokens": 84841585.0, + "reward": 3.8534140586853027, + "reward_std": 0.5773124694824219, + "rewards/reward_fn/mean": 3.8534140586853027, + "rewards/reward_fn/std": 0.5773124694824219, + "step": 3644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 151.6875, + "completions/mean_terminated_length": 151.6875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.4209493013050006, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.006616403334192, + "learning_rate": 7.119999999999999e-07, + "loss": 0.0003, + "num_tokens": 84863399.0, + "reward": 3.97446346282959, + "reward_std": 0.14445707201957703, + "rewards/reward_fn/mean": 3.97446346282959, + "rewards/reward_fn/std": 0.14445705711841583, + "step": 3645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 175.59375, + "completions/mean_terminated_length": 175.59375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.4210647880817646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054931640625, + "kl": 0.011093344393884763, + "learning_rate": 7.1e-07, + "loss": 0.0004, + "num_tokens": 84881210.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 95.1875, + "completions/mean_terminated_length": 95.1875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.4211802748585287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.008108614620141452, + "learning_rate": 7.079999999999999e-07, + "loss": 0.0003, + "num_tokens": 84903232.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 153.5625, + "completions/mean_terminated_length": 153.5625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.42129576163529275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04443359375, + "kl": 0.009462919930228963, + "learning_rate": 7.059999999999999e-07, + "loss": 0.0004, + "num_tokens": 84918898.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 249.15625, + "completions/mean_terminated_length": 249.15625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.4214112484120568, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.00865220942068845, + "learning_rate": 7.04e-07, + "loss": 0.0003, + "num_tokens": 84946359.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 81.5, + "completions/mean_terminated_length": 81.5, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.4215267351888209, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.008188988409528974, + "learning_rate": 7.019999999999999e-07, + "loss": 0.0003, + "num_tokens": 84974343.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 167.375, + "completions/mean_terminated_length": 167.375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.42164222196558493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.013273691642098129, + "learning_rate": 7e-07, + "loss": 0.0005, + "num_tokens": 84993491.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 224.125, + "completions/mean_terminated_length": 224.125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.421757708742349, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.0157710436033085, + "learning_rate": 6.979999999999999e-07, + "loss": 0.0006, + "num_tokens": 85012375.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 235.96875, + "completions/mean_terminated_length": 235.96875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.42187319551911306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.008442699967417866, + "learning_rate": 6.959999999999999e-07, + "loss": 0.0003, + "num_tokens": 85032246.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 248.3125, + "completions/mean_terminated_length": 248.3125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.4219886822958771, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03857421875, + "kl": 0.006380549959430937, + "learning_rate": 6.939999999999999e-07, + "loss": 0.0003, + "num_tokens": 85059904.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 74.25, + "completions/mean_terminated_length": 74.25, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.4221041690726412, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.21875, + "kl": 0.01618101976782782, + "learning_rate": 6.919999999999999e-07, + "loss": 0.0006, + "num_tokens": 85073832.0, + "reward": 3.782615900039673, + "reward_std": 0.3281198740005493, + "rewards/reward_fn/mean": 3.782615900039673, + "rewards/reward_fn/std": 0.3281199038028717, + "step": 3655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 154.625, + "completions/mean_terminated_length": 154.625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.42221965584940524, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.009473399753915146, + "learning_rate": 6.9e-07, + "loss": 0.0004, + "num_tokens": 85091324.0, + "reward": 3.9301910400390625, + "reward_std": 0.394898921251297, + "rewards/reward_fn/mean": 3.9301910400390625, + "rewards/reward_fn/std": 0.39489883184432983, + "step": 3656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 390.0625, + "completions/mean_terminated_length": 390.0625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.4223351426261693, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.011489888958749361, + "learning_rate": 6.879999999999999e-07, + "loss": 0.0005, + "num_tokens": 85119102.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 255.3125, + "completions/mean_terminated_length": 255.3125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.4224506294029334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.006864318369480316, + "learning_rate": 6.86e-07, + "loss": 0.0003, + "num_tokens": 85142376.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 101.15625, + "completions/mean_terminated_length": 101.15625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.4225661161796974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.007420575231662951, + "learning_rate": 6.84e-07, + "loss": 0.0003, + "num_tokens": 85156685.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 185.90625, + "completions/mean_terminated_length": 185.90625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.4226816029564615, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.018439077117363922, + "learning_rate": 6.82e-07, + "loss": 0.0007, + "num_tokens": 85174378.0, + "reward": 3.90897536277771, + "reward_std": 0.2878890037536621, + "rewards/reward_fn/mean": 3.90897536277771, + "rewards/reward_fn/std": 0.2878890037536621, + "step": 3660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 174.8125, + "completions/mean_terminated_length": 174.8125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.42279708973322555, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.007793684686475899, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0003, + "num_tokens": 85203012.0, + "reward": 2.9550325870513916, + "reward_std": 0.044256020337343216, + "rewards/reward_fn/mean": 2.9550325870513916, + "rewards/reward_fn/std": 0.04425599053502083, + "step": 3661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 179.8125, + "completions/mean_terminated_length": 179.8125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.4229125765099896, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.007096897672454361, + "learning_rate": 6.78e-07, + "loss": 0.0003, + "num_tokens": 85232990.0, + "reward": 3.422631025314331, + "reward_std": 0.3968169093132019, + "rewards/reward_fn/mean": 3.422631025314331, + "rewards/reward_fn/std": 0.3968169093132019, + "step": 3662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 121.125, + "completions/mean_terminated_length": 121.125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.4230280632867537, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "kl": 0.01331527403817745, + "learning_rate": 6.76e-07, + "loss": 0.0005, + "num_tokens": 85249410.0, + "reward": 3.970663547515869, + "reward_std": 0.1659516841173172, + "rewards/reward_fn/mean": 3.970663547515869, + "rewards/reward_fn/std": 0.1659516543149948, + "step": 3663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 154.71875, + "completions/mean_terminated_length": 154.71875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.4231435500635177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.008671013936691452, + "learning_rate": 6.74e-07, + "loss": 0.0003, + "num_tokens": 85273689.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 244.375, + "completions/mean_terminated_length": 244.375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.42325903684028177, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.009140754380496219, + "learning_rate": 6.72e-07, + "loss": 0.0004, + "num_tokens": 85303621.0, + "reward": 3.859187126159668, + "reward_std": 0.46944043040275574, + "rewards/reward_fn/mean": 3.859187126159668, + "rewards/reward_fn/std": 0.4694404602050781, + "step": 3665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 154.59375, + "completions/mean_terminated_length": 154.59375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.42337452361704586, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.008565794298192486, + "learning_rate": 6.7e-07, + "loss": 0.0003, + "num_tokens": 85331224.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 178.34375, + "completions/mean_terminated_length": 178.34375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.4234900103938099, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.017102585959946737, + "learning_rate": 6.68e-07, + "loss": 0.0007, + "num_tokens": 85359651.0, + "reward": 3.175473213195801, + "reward_std": 0.14133283495903015, + "rewards/reward_fn/mean": 3.175473213195801, + "rewards/reward_fn/std": 0.14133282005786896, + "step": 3667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 68.0, + "completions/max_terminated_length": 68.0, + "completions/mean_length": 57.15625, + "completions/mean_terminated_length": 57.15625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.423605497170574, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2275390625, + "kl": 0.014586989102099324, + "learning_rate": 6.66e-07, + "loss": 0.0006, + "num_tokens": 85372168.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 255.5625, + "completions/mean_terminated_length": 255.5625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.42372098394733804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.012535304966149852, + "learning_rate": 6.64e-07, + "loss": 0.0005, + "num_tokens": 85396922.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 127.125, + "completions/mean_terminated_length": 127.125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.4238364707241021, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.006492129417893011, + "learning_rate": 6.62e-07, + "loss": 0.0003, + "num_tokens": 85424286.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 59.8125, + "completions/mean_terminated_length": 59.8125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.4239519575008662, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.158203125, + "kl": 0.010704145788622554, + "learning_rate": 6.6e-07, + "loss": 0.0004, + "num_tokens": 85437816.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 69.09375, + "completions/mean_terminated_length": 69.09375, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.4240674442776302, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045654296875, + "kl": 0.003486653444269905, + "learning_rate": 6.58e-07, + "loss": 0.0001, + "num_tokens": 85448827.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 59.03125, + "completions/mean_terminated_length": 59.03125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.42418293105439425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054931640625, + "kl": 0.0031709155673524947, + "learning_rate": 6.56e-07, + "loss": 0.0001, + "num_tokens": 85461532.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 85.03125, + "completions/mean_terminated_length": 85.03125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.42429841783115835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.006286919446210959, + "learning_rate": 6.54e-07, + "loss": 0.0003, + "num_tokens": 85476445.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 215.65625, + "completions/mean_terminated_length": 215.65625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.4244139046079224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.00999596909241518, + "learning_rate": 6.52e-07, + "loss": 0.0004, + "num_tokens": 85493938.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 177.71875, + "completions/mean_terminated_length": 177.71875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.4245293913846864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.019189633429050446, + "learning_rate": 6.5e-07, + "loss": 0.0008, + "num_tokens": 85522569.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 86.96875, + "completions/mean_terminated_length": 86.96875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.4246448781614505, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.013444082178466488, + "learning_rate": 6.48e-07, + "loss": 0.0005, + "num_tokens": 85542728.0, + "reward": 3.9767158031463623, + "reward_std": 0.13171496987342834, + "rewards/reward_fn/mean": 3.9767158031463623, + "rewards/reward_fn/std": 0.13171492516994476, + "step": 3677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 136.375, + "completions/mean_terminated_length": 136.375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.42476036493821456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.007495803096389864, + "learning_rate": 6.46e-07, + "loss": 0.0003, + "num_tokens": 85575156.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 139.1875, + "completions/mean_terminated_length": 139.1875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.42487585171497866, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.006965044696698897, + "learning_rate": 6.44e-07, + "loss": 0.0003, + "num_tokens": 85589306.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 107.8125, + "completions/mean_terminated_length": 107.8125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.4249913384917427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.005095278494991362, + "learning_rate": 6.42e-07, + "loss": 0.0002, + "num_tokens": 85614004.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 128.125, + "completions/mean_terminated_length": 128.125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.42510682526850674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.009309465327532962, + "learning_rate": 6.4e-07, + "loss": 0.0004, + "num_tokens": 85626936.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 238.125, + "completions/mean_terminated_length": 238.125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.42522231204527083, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.011859030797495507, + "learning_rate": 6.38e-07, + "loss": 0.0005, + "num_tokens": 85655484.0, + "reward": 3.6741554737091064, + "reward_std": 0.5435678362846375, + "rewards/reward_fn/mean": 3.6741554737091064, + "rewards/reward_fn/std": 0.5435677766799927, + "step": 3682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 103.75, + "completions/mean_terminated_length": 103.75, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.4253377988220349, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.009411308325070422, + "learning_rate": 6.36e-07, + "loss": 0.0004, + "num_tokens": 85677268.0, + "reward": 3.923130750656128, + "reward_std": 0.3026314973831177, + "rewards/reward_fn/mean": 3.923130750656128, + "rewards/reward_fn/std": 0.3026314675807953, + "step": 3683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 241.53125, + "completions/mean_terminated_length": 241.53125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.4254532855987989, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.014182522980263457, + "learning_rate": 6.34e-07, + "loss": 0.0006, + "num_tokens": 85707333.0, + "reward": 3.3081324100494385, + "reward_std": 0.6615033745765686, + "rewards/reward_fn/mean": 3.3081324100494385, + "rewards/reward_fn/std": 0.6615033149719238, + "step": 3684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 278.78125, + "completions/mean_terminated_length": 278.78125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.425568772375563, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.008196085276722442, + "learning_rate": 6.319999999999999e-07, + "loss": 0.0003, + "num_tokens": 85727742.0, + "reward": 3.7215306758880615, + "reward_std": 0.6351849436759949, + "rewards/reward_fn/mean": 3.7215306758880615, + "rewards/reward_fn/std": 0.6351848840713501, + "step": 3685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 88.3125, + "completions/mean_terminated_length": 88.3125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.42568425915232705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.007061982178129256, + "learning_rate": 6.3e-07, + "loss": 0.0003, + "num_tokens": 85743368.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1649.0, + "completions/max_terminated_length": 1649.0, + "completions/mean_length": 360.6875, + "completions/mean_terminated_length": 360.6875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.42579974592909114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.010575519903795794, + "learning_rate": 6.28e-07, + "loss": 0.0004, + "num_tokens": 85777534.0, + "reward": 3.6862857341766357, + "reward_std": 0.449146032333374, + "rewards/reward_fn/mean": 3.6862857341766357, + "rewards/reward_fn/std": 0.4491460621356964, + "step": 3687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 174.3125, + "completions/mean_terminated_length": 174.3125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.4259152327058552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044921875, + "kl": 0.006946815439732745, + "learning_rate": 6.26e-07, + "loss": 0.0003, + "num_tokens": 85798600.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 300.9375, + "completions/mean_terminated_length": 300.9375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.4260307194826192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.015173799110925756, + "learning_rate": 6.24e-07, + "loss": 0.0006, + "num_tokens": 85821702.0, + "reward": 3.861605644226074, + "reward_std": 0.5445772409439087, + "rewards/reward_fn/mean": 3.861605644226074, + "rewards/reward_fn/std": 0.5445772409439087, + "step": 3689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 95.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 68.71875, + "completions/mean_terminated_length": 68.71875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.4261462062593833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.0032850132793100784, + "learning_rate": 6.219999999999999e-07, + "loss": 0.0001, + "num_tokens": 85839741.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 155.46875, + "completions/mean_terminated_length": 155.46875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.42626169303614736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.006560531379363965, + "learning_rate": 6.2e-07, + "loss": 0.0003, + "num_tokens": 85861676.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 128.1875, + "completions/mean_terminated_length": 128.1875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.4263771798129114, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062255859375, + "kl": 0.009897393007122446, + "learning_rate": 6.18e-07, + "loss": 0.0004, + "num_tokens": 85888050.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 59.28125, + "completions/mean_terminated_length": 59.28125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.4264926665896755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.0064805185047589475, + "learning_rate": 6.16e-07, + "loss": 0.0003, + "num_tokens": 85903771.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 292.71875, + "completions/mean_terminated_length": 292.71875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.42660815336643954, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.008007426200492773, + "learning_rate": 6.14e-07, + "loss": 0.0003, + "num_tokens": 85924402.0, + "reward": 3.8590915203094482, + "reward_std": 0.48604530096054077, + "rewards/reward_fn/mean": 3.8590915203094482, + "rewards/reward_fn/std": 0.486045241355896, + "step": 3694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 98.0, + "completions/mean_terminated_length": 98.0, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.42672364014320363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.008045898943237262, + "learning_rate": 6.119999999999999e-07, + "loss": 0.0003, + "num_tokens": 85952434.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 208.875, + "completions/mean_terminated_length": 208.875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.42683912691996767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.012655494443606585, + "learning_rate": 6.1e-07, + "loss": 0.0005, + "num_tokens": 85982190.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 108.40625, + "completions/mean_terminated_length": 108.40625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.4269546136967317, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.005067016332759522, + "learning_rate": 6.079999999999999e-07, + "loss": 0.0002, + "num_tokens": 86006779.0, + "reward": 3.6237738132476807, + "reward_std": 0.12515228986740112, + "rewards/reward_fn/mean": 3.6237738132476807, + "rewards/reward_fn/std": 0.12515226006507874, + "step": 3697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 164.65625, + "completions/mean_terminated_length": 164.65625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.4270701004734958, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.017018541911966167, + "learning_rate": 6.06e-07, + "loss": 0.0007, + "num_tokens": 86032528.0, + "reward": 3.3075742721557617, + "reward_std": 0.1181129440665245, + "rewards/reward_fn/mean": 3.3075742721557617, + "rewards/reward_fn/std": 0.11811289936304092, + "step": 3698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 185.4375, + "completions/mean_terminated_length": 185.4375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.42718558725025985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.008038798005145509, + "learning_rate": 6.04e-07, + "loss": 0.0003, + "num_tokens": 86052798.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 163.34375, + "completions/mean_terminated_length": 163.34375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.4273010740270239, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.01190626694005914, + "learning_rate": 6.019999999999999e-07, + "loss": 0.0005, + "num_tokens": 86069865.0, + "reward": 3.9390430450439453, + "reward_std": 0.19757123291492462, + "rewards/reward_fn/mean": 3.9390430450439453, + "rewards/reward_fn/std": 0.19757124781608582, + "step": 3700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.427416560803788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04443359375, + "kl": 0.008338174142409116, + "learning_rate": 6e-07, + "loss": 0.0003, + "num_tokens": 86088840.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 88.21875, + "completions/mean_terminated_length": 88.21875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.427532047580552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.010948686343908776, + "learning_rate": 5.979999999999999e-07, + "loss": 0.0004, + "num_tokens": 86110575.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1168.0, + "completions/max_terminated_length": 1168.0, + "completions/mean_length": 357.21875, + "completions/mean_terminated_length": 357.21875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.42764753435731606, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.011511135366163217, + "learning_rate": 5.96e-07, + "loss": 0.0005, + "num_tokens": 86144150.0, + "reward": 3.406424045562744, + "reward_std": 0.586902916431427, + "rewards/reward_fn/mean": 3.406424045562744, + "rewards/reward_fn/std": 0.586902916431427, + "step": 3703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 194.9375, + "completions/mean_terminated_length": 194.9375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.42776302113408016, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.005822977404022822, + "learning_rate": 5.939999999999999e-07, + "loss": 0.0002, + "num_tokens": 86163764.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 206.71875, + "completions/mean_terminated_length": 206.71875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.4278785079108442, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.009230231247784104, + "learning_rate": 5.919999999999999e-07, + "loss": 0.0004, + "num_tokens": 86182827.0, + "reward": 2.8523049354553223, + "reward_std": 0.28127142786979675, + "rewards/reward_fn/mean": 2.8523049354553223, + "rewards/reward_fn/std": 0.28127142786979675, + "step": 3705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 197.875, + "completions/mean_terminated_length": 197.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.4279939946876083, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.01364430422836449, + "learning_rate": 5.9e-07, + "loss": 0.0005, + "num_tokens": 86204391.0, + "reward": 3.9795644283294678, + "reward_std": 0.11560137569904327, + "rewards/reward_fn/mean": 3.9795644283294678, + "rewards/reward_fn/std": 0.11560139060020447, + "step": 3706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 229.0625, + "completions/mean_terminated_length": 229.0625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.42810948146437233, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.0054821336816530675, + "learning_rate": 5.879999999999999e-07, + "loss": 0.0002, + "num_tokens": 86232233.0, + "reward": 3.9305787086486816, + "reward_std": 0.392706036567688, + "rewards/reward_fn/mean": 3.9305787086486816, + "rewards/reward_fn/std": 0.392706036567688, + "step": 3707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 224.875, + "completions/mean_terminated_length": 224.875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.4282249682411364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.010581190879747737, + "learning_rate": 5.86e-07, + "loss": 0.0004, + "num_tokens": 86256677.0, + "reward": 3.940626859664917, + "reward_std": 0.23520520329475403, + "rewards/reward_fn/mean": 3.940626859664917, + "rewards/reward_fn/std": 0.23520521819591522, + "step": 3708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 81.6875, + "completions/mean_terminated_length": 81.6875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.42834045501790047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.008198910632927436, + "learning_rate": 5.839999999999999e-07, + "loss": 0.0003, + "num_tokens": 86274267.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 153.21875, + "completions/mean_terminated_length": 153.21875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.4284559417946645, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.007225709985505091, + "learning_rate": 5.819999999999999e-07, + "loss": 0.0003, + "num_tokens": 86302818.0, + "reward": 3.9781219959259033, + "reward_std": 0.12376107275485992, + "rewards/reward_fn/mean": 3.9781219959259033, + "rewards/reward_fn/std": 0.12376109510660172, + "step": 3710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 250.28125, + "completions/mean_terminated_length": 250.28125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.42857142857142855, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.013966862243250944, + "learning_rate": 5.8e-07, + "loss": 0.0006, + "num_tokens": 86333387.0, + "reward": 3.036686420440674, + "reward_std": 0.3475593030452728, + "rewards/reward_fn/mean": 3.036686420440674, + "rewards/reward_fn/std": 0.3475593328475952, + "step": 3711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 97.78125, + "completions/mean_terminated_length": 97.78125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.42868691534819264, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.3125, + "kl": 0.018994477155501954, + "learning_rate": 5.779999999999999e-07, + "loss": 0.0008, + "num_tokens": 86349316.0, + "reward": 3.8446764945983887, + "reward_std": 0.2982982397079468, + "rewards/reward_fn/mean": 3.8446764945983887, + "rewards/reward_fn/std": 0.29829826951026917, + "step": 3712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 184.4375, + "completions/mean_terminated_length": 184.4375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.4288024021249567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.01308196566242259, + "learning_rate": 5.76e-07, + "loss": 0.0005, + "num_tokens": 86373650.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 217.90625, + "completions/mean_terminated_length": 217.90625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.4289178889017208, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.00970870546734659, + "learning_rate": 5.739999999999999e-07, + "loss": 0.0004, + "num_tokens": 86405839.0, + "reward": 3.975156307220459, + "reward_std": 0.14053645730018616, + "rewards/reward_fn/mean": 3.975156307220459, + "rewards/reward_fn/std": 0.14053642749786377, + "step": 3714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 142.15625, + "completions/mean_terminated_length": 142.15625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.4290333756784848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.011075319191149902, + "learning_rate": 5.719999999999999e-07, + "loss": 0.0004, + "num_tokens": 86432148.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 256.53125, + "completions/mean_terminated_length": 256.53125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.42914886245524886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040771484375, + "kl": 0.008088487658824306, + "learning_rate": 5.699999999999999e-07, + "loss": 0.0003, + "num_tokens": 86463749.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.42926434923201295, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.012457223943783902, + "learning_rate": 5.679999999999999e-07, + "loss": 0.0005, + "num_tokens": 86491861.0, + "reward": 3.842723846435547, + "reward_std": 0.4686085879802704, + "rewards/reward_fn/mean": 3.842723846435547, + "rewards/reward_fn/std": 0.4686085879802704, + "step": 3717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 187.90625, + "completions/mean_terminated_length": 187.90625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.429379836008777, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053955078125, + "kl": 0.009739542198076379, + "learning_rate": 5.66e-07, + "loss": 0.0004, + "num_tokens": 86510226.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 230.46875, + "completions/mean_terminated_length": 230.46875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.42949532278554103, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.013223903079051524, + "learning_rate": 5.639999999999999e-07, + "loss": 0.0005, + "num_tokens": 86529569.0, + "reward": 3.1050286293029785, + "reward_std": 1.099517822265625, + "rewards/reward_fn/mean": 3.1050286293029785, + "rewards/reward_fn/std": 1.099517822265625, + "step": 3719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 108.375, + "completions/mean_terminated_length": 108.375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.42961080956230513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.010067410956253298, + "learning_rate": 5.620000000000001e-07, + "loss": 0.0004, + "num_tokens": 86553965.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 314.53125, + "completions/mean_terminated_length": 314.53125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.42972629633906917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.009630409433157183, + "learning_rate": 5.6e-07, + "loss": 0.0004, + "num_tokens": 86576094.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 323.96875, + "completions/mean_terminated_length": 323.96875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.42984178311583326, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.007376045759883709, + "learning_rate": 5.58e-07, + "loss": 0.0003, + "num_tokens": 86609309.0, + "reward": 3.5230889320373535, + "reward_std": 0.6319891214370728, + "rewards/reward_fn/mean": 3.5230889320373535, + "rewards/reward_fn/std": 0.6319891214370728, + "step": 3722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 184.84375, + "completions/mean_terminated_length": 184.84375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.4299572698925973, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.008172281704901252, + "learning_rate": 5.560000000000001e-07, + "loss": 0.0003, + "num_tokens": 86645144.0, + "reward": 3.9706521034240723, + "reward_std": 0.16601717472076416, + "rewards/reward_fn/mean": 3.9706521034240723, + "rewards/reward_fn/std": 0.16601718962192535, + "step": 3723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 155.75, + "completions/mean_terminated_length": 155.75, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.43007275666936134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046630859375, + "kl": 0.007268323905009311, + "learning_rate": 5.54e-07, + "loss": 0.0003, + "num_tokens": 86675088.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 136.4375, + "completions/mean_terminated_length": 136.4375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.43018824344612544, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.671875, + "kl": 0.007097366094967583, + "learning_rate": 5.520000000000001e-07, + "loss": 0.0003, + "num_tokens": 86695582.0, + "reward": 3.9312233924865723, + "reward_std": 0.3890591263771057, + "rewards/reward_fn/mean": 3.9312233924865723, + "rewards/reward_fn/std": 0.38905906677246094, + "step": 3725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 105.90625, + "completions/mean_terminated_length": 105.90625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.4303037302228895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.007944831893837545, + "learning_rate": 5.5e-07, + "loss": 0.0003, + "num_tokens": 86710011.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 129.34375, + "completions/mean_terminated_length": 129.34375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.4304192169996535, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.008858162291289773, + "learning_rate": 5.48e-07, + "loss": 0.0004, + "num_tokens": 86732390.0, + "reward": 2.9639718532562256, + "reward_std": 0.03516789525747299, + "rewards/reward_fn/mean": 2.9639718532562256, + "rewards/reward_fn/std": 0.03516789898276329, + "step": 3727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 78.0, + "completions/max_terminated_length": 78.0, + "completions/mean_length": 57.65625, + "completions/mean_terminated_length": 57.65625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.4305347037764176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2353515625, + "kl": 0.01594314133399166, + "learning_rate": 5.46e-07, + "loss": 0.0006, + "num_tokens": 86757371.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 229.15625, + "completions/mean_terminated_length": 229.15625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.43065019055318166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.00762170078814961, + "learning_rate": 5.44e-07, + "loss": 0.0003, + "num_tokens": 86780960.0, + "reward": 3.9270803928375244, + "reward_std": 0.4124952256679535, + "rewards/reward_fn/mean": 3.9270803928375244, + "rewards/reward_fn/std": 0.4124951958656311, + "step": 3729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 111.40625, + "completions/mean_terminated_length": 111.40625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.4307656773299457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.138671875, + "kl": 0.014750593836652115, + "learning_rate": 5.420000000000001e-07, + "loss": 0.0006, + "num_tokens": 86808749.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 97.9375, + "completions/mean_terminated_length": 97.9375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.4308811641067098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.010987431269313674, + "learning_rate": 5.4e-07, + "loss": 0.0004, + "num_tokens": 86830155.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 89.75, + "completions/mean_terminated_length": 89.75, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.43099665088347383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.012612153121153824, + "learning_rate": 5.38e-07, + "loss": 0.0005, + "num_tokens": 86851491.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 124.6875, + "completions/mean_terminated_length": 124.6875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.4311121376602379, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.008653260687424336, + "learning_rate": 5.36e-07, + "loss": 0.0003, + "num_tokens": 86877657.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 222.5, + "completions/mean_terminated_length": 222.5, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.43122762443700197, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.00794635695638135, + "learning_rate": 5.34e-07, + "loss": 0.0003, + "num_tokens": 86903977.0, + "reward": 3.5467796325683594, + "reward_std": 0.57369065284729, + "rewards/reward_fn/mean": 3.5467796325683594, + "rewards/reward_fn/std": 0.5736905932426453, + "step": 3734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 249.8125, + "completions/mean_terminated_length": 249.8125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.431343111213766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042236328125, + "kl": 0.007464592497854028, + "learning_rate": 5.32e-07, + "loss": 0.0003, + "num_tokens": 86927075.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 68.71875, + "completions/mean_terminated_length": 68.71875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.4314585979905301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1220703125, + "kl": 0.011687922597047873, + "learning_rate": 5.3e-07, + "loss": 0.0005, + "num_tokens": 86959162.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 167.03125, + "completions/mean_terminated_length": 167.03125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.43157408476729414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.00979083721904317, + "learning_rate": 5.28e-07, + "loss": 0.0004, + "num_tokens": 86981691.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 114.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 77.46875, + "completions/mean_terminated_length": 77.46875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.4316895715440582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.004014027399534825, + "learning_rate": 5.26e-07, + "loss": 0.0002, + "num_tokens": 87007178.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 127.0, + "completions/mean_terminated_length": 127.0, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.4318050583208223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.011112876891274936, + "learning_rate": 5.24e-07, + "loss": 0.0004, + "num_tokens": 87029322.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 156.40625, + "completions/mean_terminated_length": 156.40625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.4319205450975863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.010205146885709837, + "learning_rate": 5.22e-07, + "loss": 0.0004, + "num_tokens": 87046199.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 123.34375, + "completions/mean_terminated_length": 123.34375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.4320360318743504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.00892855061829323, + "learning_rate": 5.2e-07, + "loss": 0.0004, + "num_tokens": 87063810.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 73.0, + "completions/max_terminated_length": 73.0, + "completions/mean_length": 51.65625, + "completions/mean_terminated_length": 51.65625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.43215151865111445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2578125, + "kl": 0.01773426221916452, + "learning_rate": 5.18e-07, + "loss": 0.0007, + "num_tokens": 87095095.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 349.25, + "completions/mean_terminated_length": 349.25, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.4322670054278785, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061767578125, + "kl": 0.00849994677992072, + "learning_rate": 5.16e-07, + "loss": 0.0003, + "num_tokens": 87121279.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 288.40625, + "completions/mean_terminated_length": 288.40625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.4323824922046426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.007894911352195777, + "learning_rate": 5.14e-07, + "loss": 0.0003, + "num_tokens": 87145580.0, + "reward": 3.9301462173461914, + "reward_std": 0.3951535224914551, + "rewards/reward_fn/mean": 3.9301462173461914, + "rewards/reward_fn/std": 0.3951534926891327, + "step": 3744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 343.625, + "completions/mean_terminated_length": 343.625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.43249797898140663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03662109375, + "kl": 0.006678491110506002, + "learning_rate": 5.12e-07, + "loss": 0.0003, + "num_tokens": 87171712.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 304.25, + "completions/mean_terminated_length": 304.25, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.43261346575817067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150390625, + "kl": 0.008673515432747081, + "learning_rate": 5.1e-07, + "loss": 0.0003, + "num_tokens": 87195240.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 83.53125, + "completions/mean_terminated_length": 83.53125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.43272895253493476, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "kl": 0.03697377711068839, + "learning_rate": 5.079999999999999e-07, + "loss": 0.0015, + "num_tokens": 87220857.0, + "reward": 3.913879871368408, + "reward_std": 0.2774004638195038, + "rewards/reward_fn/mean": 3.913879871368408, + "rewards/reward_fn/std": 0.27740049362182617, + "step": 3747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 207.78125, + "completions/mean_terminated_length": 207.78125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.4328444393116988, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.012730193557217717, + "learning_rate": 5.06e-07, + "loss": 0.0005, + "num_tokens": 87249650.0, + "reward": 3.92846941947937, + "reward_std": 0.4046376645565033, + "rewards/reward_fn/mean": 3.92846941947937, + "rewards/reward_fn/std": 0.4046376347541809, + "step": 3748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 85.1875, + "completions/mean_terminated_length": 85.1875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.4329599260884629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04833984375, + "kl": 0.005905521480599418, + "learning_rate": 5.04e-07, + "loss": 0.0002, + "num_tokens": 87270360.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 85.0, + "completions/mean_terminated_length": 85.0, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.43307541286522694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.008989412475784775, + "learning_rate": 5.02e-07, + "loss": 0.0004, + "num_tokens": 87291544.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 258.21875, + "completions/mean_terminated_length": 258.21875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.433190899641991, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.011484369460958987, + "learning_rate": 5e-07, + "loss": 0.0005, + "num_tokens": 87317887.0, + "reward": 3.871060371398926, + "reward_std": 0.4494946300983429, + "rewards/reward_fn/mean": 3.871060371398926, + "rewards/reward_fn/std": 0.4494946300983429, + "step": 3751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 96.71875, + "completions/mean_terminated_length": 96.71875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.4333063864187551, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042236328125, + "kl": 0.0036715981495945016, + "learning_rate": 4.979999999999999e-07, + "loss": 0.0001, + "num_tokens": 87339638.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 119.15625, + "completions/mean_terminated_length": 119.15625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.4334218731955191, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.0030696347948833136, + "learning_rate": 4.96e-07, + "loss": 0.0001, + "num_tokens": 87357339.0, + "reward": 3.964066743850708, + "reward_std": 0.2032688707113266, + "rewards/reward_fn/mean": 3.964066743850708, + "rewards/reward_fn/std": 0.2032688558101654, + "step": 3753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 240.0625, + "completions/mean_terminated_length": 240.0625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.43353735997228315, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.015059398065204732, + "learning_rate": 4.94e-07, + "loss": 0.0006, + "num_tokens": 87376061.0, + "reward": 3.0303096771240234, + "reward_std": 0.06038280576467514, + "rewards/reward_fn/mean": 3.0303096771240234, + "rewards/reward_fn/std": 0.060382768511772156, + "step": 3754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 117.6875, + "completions/mean_terminated_length": 117.6875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.43365284674904725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.007275563846633304, + "learning_rate": 4.92e-07, + "loss": 0.0003, + "num_tokens": 87397011.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 302.8125, + "completions/mean_terminated_length": 302.8125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.4337683335258113, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0439453125, + "kl": 0.00907774834195152, + "learning_rate": 4.9e-07, + "loss": 0.0004, + "num_tokens": 87416365.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 101.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 75.625, + "completions/mean_terminated_length": 75.625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.43388382030257533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.006149053847366304, + "learning_rate": 4.879999999999999e-07, + "loss": 0.0002, + "num_tokens": 87435809.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 358.21875, + "completions/mean_terminated_length": 358.21875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.4339993070793394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031982421875, + "kl": 0.0072887592104962096, + "learning_rate": 4.86e-07, + "loss": 0.0003, + "num_tokens": 87457992.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 164.5625, + "completions/mean_terminated_length": 164.5625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.43411479385610346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046142578125, + "kl": 0.007348387880483642, + "learning_rate": 4.839999999999999e-07, + "loss": 0.0003, + "num_tokens": 87482522.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 114.03125, + "completions/mean_terminated_length": 114.03125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.43423028063286756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.006396545028110268, + "learning_rate": 4.82e-07, + "loss": 0.0003, + "num_tokens": 87505915.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 183.75, + "completions/mean_terminated_length": 183.75, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.4343457674096316, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0478515625, + "kl": 0.008099200851575006, + "learning_rate": 4.8e-07, + "loss": 0.0003, + "num_tokens": 87527987.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 133.15625, + "completions/mean_terminated_length": 133.15625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.43446125418639564, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.008912982790207025, + "learning_rate": 4.779999999999999e-07, + "loss": 0.0004, + "num_tokens": 87552760.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 67.71875, + "completions/mean_terminated_length": 67.71875, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.43457674096315974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1826171875, + "kl": 0.011453378676378634, + "learning_rate": 4.76e-07, + "loss": 0.0005, + "num_tokens": 87576719.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 243.75, + "completions/mean_terminated_length": 243.75, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.4346922277399238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.008665910259878729, + "learning_rate": 4.7399999999999993e-07, + "loss": 0.0003, + "num_tokens": 87599591.0, + "reward": 3.9719467163085938, + "reward_std": 0.15869416296482086, + "rewards/reward_fn/mean": 3.9719467163085938, + "rewards/reward_fn/std": 0.15869417786598206, + "step": 3764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 155.625, + "completions/mean_terminated_length": 155.625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.4348077145166878, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05029296875, + "kl": 0.007742144378426019, + "learning_rate": 4.7199999999999994e-07, + "loss": 0.0003, + "num_tokens": 87627707.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 250.0, + "completions/mean_terminated_length": 250.0, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.4349232012934519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.015308555637602694, + "learning_rate": 4.6999999999999995e-07, + "loss": 0.0006, + "num_tokens": 87647227.0, + "reward": 3.0438010692596436, + "reward_std": 0.47873327136039734, + "rewards/reward_fn/mean": 3.0438010692596436, + "rewards/reward_fn/std": 0.47873321175575256, + "step": 3766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 249.625, + "completions/mean_terminated_length": 249.625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.43503868807021595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.01024534810858313, + "learning_rate": 4.68e-07, + "loss": 0.0004, + "num_tokens": 87667567.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 116.96875, + "completions/mean_terminated_length": 116.96875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.43515417484698005, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.008141674399666954, + "learning_rate": 4.66e-07, + "loss": 0.0003, + "num_tokens": 87687662.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 263.5, + "completions/mean_terminated_length": 263.5, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.4352696616237441, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.00821471420204034, + "learning_rate": 4.64e-07, + "loss": 0.0003, + "num_tokens": 87712158.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 96.6875, + "completions/mean_terminated_length": 96.6875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.4353851484005081, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.006934193956112722, + "learning_rate": 4.62e-07, + "loss": 0.0003, + "num_tokens": 87727860.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 168.1875, + "completions/mean_terminated_length": 168.1875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.4355006351772722, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.02650043641915545, + "learning_rate": 4.6e-07, + "loss": 0.0011, + "num_tokens": 87755674.0, + "reward": 3.228733539581299, + "reward_std": 0.5713870525360107, + "rewards/reward_fn/mean": 3.228733539581299, + "rewards/reward_fn/std": 0.5713870525360107, + "step": 3771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 179.4375, + "completions/mean_terminated_length": 179.4375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.43561612195403626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.01305884411704028, + "learning_rate": 4.58e-07, + "loss": 0.0005, + "num_tokens": 87783656.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 212.59375, + "completions/mean_terminated_length": 212.59375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.4357316087308003, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.013322707178303972, + "learning_rate": 4.56e-07, + "loss": 0.0005, + "num_tokens": 87798523.0, + "reward": 3.4381511211395264, + "reward_std": 0.5828627943992615, + "rewards/reward_fn/mean": 3.4381511211395264, + "rewards/reward_fn/std": 0.5828627347946167, + "step": 3773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 165.15625, + "completions/mean_terminated_length": 165.15625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.4358470955075644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.011413132364396006, + "learning_rate": 4.54e-07, + "loss": 0.0005, + "num_tokens": 87814752.0, + "reward": 3.1994988918304443, + "reward_std": 0.03652293607592583, + "rewards/reward_fn/mean": 3.1994988918304443, + "rewards/reward_fn/std": 0.03652294725179672, + "step": 3774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 338.6875, + "completions/mean_terminated_length": 338.6875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.43596258228432844, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.010718259203713387, + "learning_rate": 4.5199999999999997e-07, + "loss": 0.0004, + "num_tokens": 87841654.0, + "reward": 3.928621768951416, + "reward_std": 0.40377652645111084, + "rewards/reward_fn/mean": 3.928621768951416, + "rewards/reward_fn/std": 0.40377649664878845, + "step": 3775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 191.0, + "completions/mean_terminated_length": 191.0, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.43607806906109253, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.01053763219533721, + "learning_rate": 4.5e-07, + "loss": 0.0004, + "num_tokens": 87872790.0, + "reward": 3.9100773334503174, + "reward_std": 0.39919227361679077, + "rewards/reward_fn/mean": 3.9100773334503174, + "rewards/reward_fn/std": 0.3991922438144684, + "step": 3776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 439.125, + "completions/mean_terminated_length": 439.125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.4361935558378566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.01128674803476315, + "learning_rate": 4.48e-07, + "loss": 0.0005, + "num_tokens": 87906138.0, + "reward": 3.422635555267334, + "reward_std": 0.7032244205474854, + "rewards/reward_fn/mean": 3.422635555267334, + "rewards/reward_fn/std": 0.7032243609428406, + "step": 3777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 113.84375, + "completions/mean_terminated_length": 113.84375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.4363090426146206, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.984375, + "kl": 0.015399294235976413, + "learning_rate": 4.46e-07, + "loss": 0.0006, + "num_tokens": 87937013.0, + "reward": 3.8030905723571777, + "reward_std": 0.3785586953163147, + "rewards/reward_fn/mean": 3.8030905723571777, + "rewards/reward_fn/std": 0.3785586655139923, + "step": 3778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 180.5, + "completions/mean_terminated_length": 180.5, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.4364245293913847, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.015761194415972568, + "learning_rate": 4.44e-07, + "loss": 0.0006, + "num_tokens": 87963845.0, + "reward": 3.9701590538024902, + "reward_std": 0.16880537569522858, + "rewards/reward_fn/mean": 3.9701590538024902, + "rewards/reward_fn/std": 0.16880542039871216, + "step": 3779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 102.40625, + "completions/mean_terminated_length": 102.40625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.43654001616814875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.008495503920130432, + "learning_rate": 4.4199999999999996e-07, + "loss": 0.0003, + "num_tokens": 87985170.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 105.59375, + "completions/mean_terminated_length": 105.59375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.4366555029449128, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1220703125, + "kl": 0.013576754397945479, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0005, + "num_tokens": 88012549.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 238.09375, + "completions/mean_terminated_length": 238.09375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.4367709897216769, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.010047841729829088, + "learning_rate": 4.38e-07, + "loss": 0.0004, + "num_tokens": 88031560.0, + "reward": 3.9313459396362305, + "reward_std": 0.3883662223815918, + "rewards/reward_fn/mean": 3.9313459396362305, + "rewards/reward_fn/std": 0.3883662819862366, + "step": 3782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 227.65625, + "completions/mean_terminated_length": 227.65625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.4368864764984409, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.019554069833247922, + "learning_rate": 4.36e-07, + "loss": 0.0008, + "num_tokens": 88061725.0, + "reward": 3.4079508781433105, + "reward_std": 0.4724227488040924, + "rewards/reward_fn/mean": 3.4079508781433105, + "rewards/reward_fn/std": 0.4724227488040924, + "step": 3783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 165.625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.43700196327520496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.009396716119226767, + "learning_rate": 4.34e-07, + "loss": 0.0004, + "num_tokens": 88091601.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 88.59375, + "completions/mean_terminated_length": 88.59375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.43711745005196906, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.007487453469366301, + "learning_rate": 4.3199999999999995e-07, + "loss": 0.0003, + "num_tokens": 88106468.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 189.59375, + "completions/mean_terminated_length": 189.59375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.4372329368287331, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054443359375, + "kl": 0.010195092014328111, + "learning_rate": 4.2999999999999996e-07, + "loss": 0.0004, + "num_tokens": 88125207.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 218.0625, + "completions/mean_terminated_length": 218.0625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.4373484236054972, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.009763660884345882, + "learning_rate": 4.2799999999999997e-07, + "loss": 0.0004, + "num_tokens": 88153465.0, + "reward": 3.7631826400756836, + "reward_std": 0.46227124333381653, + "rewards/reward_fn/mean": 3.7631826400756836, + "rewards/reward_fn/std": 0.4622712731361389, + "step": 3787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 112.40625, + "completions/mean_terminated_length": 112.40625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.43746391038226123, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.007397506320558023, + "learning_rate": 4.26e-07, + "loss": 0.0003, + "num_tokens": 88168102.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 232.1875, + "completions/mean_terminated_length": 232.1875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.4375793971590253, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.012450980502762832, + "learning_rate": 4.24e-07, + "loss": 0.0005, + "num_tokens": 88188748.0, + "reward": 3.715816020965576, + "reward_std": 0.763926088809967, + "rewards/reward_fn/mean": 3.715816020965576, + "rewards/reward_fn/std": 0.7639260292053223, + "step": 3789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 104.0, + "completions/max_terminated_length": 104.0, + "completions/mean_length": 78.78125, + "completions/mean_terminated_length": 78.78125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.43769488393578937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.008182322930224473, + "learning_rate": 4.2199999999999994e-07, + "loss": 0.0003, + "num_tokens": 88203333.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 233.84375, + "completions/mean_terminated_length": 233.84375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.4378103707125534, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.008406701483181678, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0003, + "num_tokens": 88234272.0, + "reward": 3.270573139190674, + "reward_std": 0.4001694321632385, + "rewards/reward_fn/mean": 3.270573139190674, + "rewards/reward_fn/std": 0.40016937255859375, + "step": 3791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 89.0, + "completions/max_terminated_length": 89.0, + "completions/mean_length": 57.21875, + "completions/mean_terminated_length": 57.21875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.43792585748931745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16796875, + "kl": 0.013532419739931356, + "learning_rate": 4.1799999999999996e-07, + "loss": 0.0005, + "num_tokens": 88253959.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 79.5, + "completions/mean_terminated_length": 79.5, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.43804134426608154, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.59375, + "kl": 0.007933305888400355, + "learning_rate": 4.1599999999999997e-07, + "loss": 0.0003, + "num_tokens": 88272247.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 184.21875, + "completions/mean_terminated_length": 184.21875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.4381568310428456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040771484375, + "kl": 0.005385222342738416, + "learning_rate": 4.14e-07, + "loss": 0.0002, + "num_tokens": 88298622.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 88.03125, + "completions/mean_terminated_length": 88.03125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.4382723178196097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04541015625, + "kl": 0.0038722935987607343, + "learning_rate": 4.12e-07, + "loss": 0.0002, + "num_tokens": 88318719.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 880.0, + "completions/max_terminated_length": 880.0, + "completions/mean_length": 364.40625, + "completions/mean_terminated_length": 364.40625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.4383878045963737, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.009124051604885608, + "learning_rate": 4.0999999999999994e-07, + "loss": 0.0004, + "num_tokens": 88348716.0, + "reward": 3.6132397651672363, + "reward_std": 0.6025550961494446, + "rewards/reward_fn/mean": 3.6132397651672363, + "rewards/reward_fn/std": 0.6025550365447998, + "step": 3796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 143.09375, + "completions/mean_terminated_length": 143.09375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.43850329137313776, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0419921875, + "kl": 0.005871857225429267, + "learning_rate": 4.0799999999999995e-07, + "loss": 0.0002, + "num_tokens": 88365455.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 69.0, + "completions/max_terminated_length": 69.0, + "completions/mean_length": 46.40625, + "completions/mean_terminated_length": 46.40625, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.43861877814990186, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.203125, + "kl": 0.02391944033661275, + "learning_rate": 4.06e-07, + "loss": 0.001, + "num_tokens": 88381788.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 182.1875, + "completions/mean_terminated_length": 182.1875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.4387342649266659, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.01780632535519544, + "learning_rate": 4.04e-07, + "loss": 0.0007, + "num_tokens": 88413026.0, + "reward": 3.973463773727417, + "reward_std": 0.15011131763458252, + "rewards/reward_fn/mean": 3.973463773727417, + "rewards/reward_fn/std": 0.15011130273342133, + "step": 3799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 196.84375, + "completions/mean_terminated_length": 196.84375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.43884975170342994, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.012910010744235478, + "learning_rate": 4.02e-07, + "loss": 0.0005, + "num_tokens": 88435101.0, + "reward": 3.947451114654541, + "reward_std": 0.20813725888729095, + "rewards/reward_fn/mean": 3.947451114654541, + "rewards/reward_fn/std": 0.20813724398612976, + "step": 3800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 231.46875, + "completions/mean_terminated_length": 231.46875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.43896523848019403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.008798055088846013, + "learning_rate": 4e-07, + "loss": 0.0004, + "num_tokens": 88454060.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 152.375, + "completions/mean_terminated_length": 152.375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.43908072525695807, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.011300469865091145, + "learning_rate": 3.98e-07, + "loss": 0.0005, + "num_tokens": 88481688.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 238.6875, + "completions/mean_terminated_length": 238.6875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.43919621203372217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.006370487652020529, + "learning_rate": 3.96e-07, + "loss": 0.0003, + "num_tokens": 88507534.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 80.0, + "completions/mean_terminated_length": 80.0, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.4393116988104862, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.03125, + "kl": 0.006143269967651577, + "learning_rate": 3.94e-07, + "loss": 0.0002, + "num_tokens": 88523662.0, + "reward": 3.980050802230835, + "reward_std": 0.11284983903169632, + "rewards/reward_fn/mean": 3.980050802230835, + "rewards/reward_fn/std": 0.11284983158111572, + "step": 3804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 184.0, + "completions/mean_terminated_length": 184.0, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.43942718558725025, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.007457962034095544, + "learning_rate": 3.92e-07, + "loss": 0.0003, + "num_tokens": 88552526.0, + "reward": 2.9549310207366943, + "reward_std": 0.049286190420389175, + "rewards/reward_fn/mean": 2.9549310207366943, + "rewards/reward_fn/std": 0.04928618296980858, + "step": 3805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 233.5625, + "completions/mean_terminated_length": 233.5625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.43954267236401434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.013927505555329844, + "learning_rate": 3.8999999999999997e-07, + "loss": 0.0006, + "num_tokens": 88572704.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 157.09375, + "completions/mean_terminated_length": 157.09375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.4396581591407784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045166015625, + "kl": 0.008979986971098697, + "learning_rate": 3.88e-07, + "loss": 0.0004, + "num_tokens": 88594755.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 107.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 57.71875, + "completions/mean_terminated_length": 57.71875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.4397736459175424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.005518509475223254, + "learning_rate": 3.86e-07, + "loss": 0.0002, + "num_tokens": 88613210.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 307.5, + "completions/mean_terminated_length": 251.35482788085938, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.4398891326943065, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.01131569546123501, + "learning_rate": 3.84e-07, + "loss": 0.0005, + "num_tokens": 88652714.0, + "reward": 3.745352029800415, + "reward_std": 0.8222886323928833, + "rewards/reward_fn/mean": 3.745352029800415, + "rewards/reward_fn/std": 0.8222885727882385, + "step": 3809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 86.40625, + "completions/mean_terminated_length": 86.40625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.44000461947107056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.007633853645529598, + "learning_rate": 3.82e-07, + "loss": 0.0003, + "num_tokens": 88679831.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 105.53125, + "completions/mean_terminated_length": 105.53125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.4401201062478346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.004515238407293509, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0002, + "num_tokens": 88696136.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 200.78125, + "completions/mean_terminated_length": 200.78125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.4402355930245987, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.007304538579774089, + "learning_rate": 3.7799999999999997e-07, + "loss": 0.0003, + "num_tokens": 88725025.0, + "reward": 3.8324437141418457, + "reward_std": 0.45921486616134644, + "rewards/reward_fn/mean": 3.8324437141418457, + "rewards/reward_fn/std": 0.45921480655670166, + "step": 3812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 151.96875, + "completions/mean_terminated_length": 151.96875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.44035107980136273, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.006828965895692818, + "learning_rate": 3.76e-07, + "loss": 0.0003, + "num_tokens": 88747264.0, + "reward": 3.143404245376587, + "reward_std": 0.5404481291770935, + "rewards/reward_fn/mean": 3.143404245376587, + "rewards/reward_fn/std": 0.5404481887817383, + "step": 3813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 75.0625, + "completions/mean_terminated_length": 75.0625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.44046656657812683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.004221707533361041, + "learning_rate": 3.74e-07, + "loss": 0.0002, + "num_tokens": 88765442.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 70.125, + "completions/mean_terminated_length": 70.125, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.44058205335489087, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.005080334847662016, + "learning_rate": 3.72e-07, + "loss": 0.0002, + "num_tokens": 88778406.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 89.21875, + "completions/mean_terminated_length": 89.21875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.4406975401316549, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0, + "kl": 0.0087488634926558, + "learning_rate": 3.7e-07, + "loss": 0.0004, + "num_tokens": 88805773.0, + "reward": 3.9140214920043945, + "reward_std": 0.27240434288978577, + "rewards/reward_fn/mean": 3.9140214920043945, + "rewards/reward_fn/std": 0.2724043130874634, + "step": 3816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 133.25, + "completions/mean_terminated_length": 133.25, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.440813026908419, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.009492416007560678, + "learning_rate": 3.6799999999999996e-07, + "loss": 0.0004, + "num_tokens": 88835797.0, + "reward": 3.971904754638672, + "reward_std": 0.15893018245697021, + "rewards/reward_fn/mean": 3.971904754638672, + "rewards/reward_fn/std": 0.1589301973581314, + "step": 3817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 110.0, + "completions/max_terminated_length": 110.0, + "completions/mean_length": 82.8125, + "completions/mean_terminated_length": 82.8125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.44092851368518304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.00402356550875993, + "learning_rate": 3.6599999999999997e-07, + "loss": 0.0002, + "num_tokens": 88856751.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 218.125, + "completions/mean_terminated_length": 218.125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.4410440004619471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.00968378833931638, + "learning_rate": 3.64e-07, + "loss": 0.0004, + "num_tokens": 88881235.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 166.03125, + "completions/mean_terminated_length": 166.03125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.4411594872387112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.013438743160804734, + "learning_rate": 3.62e-07, + "loss": 0.0005, + "num_tokens": 88911988.0, + "reward": 3.988086223602295, + "reward_std": 0.06739456206560135, + "rewards/reward_fn/mean": 3.988086223602295, + "rewards/reward_fn/std": 0.06739457696676254, + "step": 3820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 80.0, + "completions/max_terminated_length": 80.0, + "completions/mean_length": 53.28125, + "completions/mean_terminated_length": 53.28125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.4412749740154752, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.005338884842785774, + "learning_rate": 3.6e-07, + "loss": 0.0002, + "num_tokens": 88928157.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 249.84375, + "completions/mean_terminated_length": 249.84375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.4413904607922393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052001953125, + "kl": 0.009039615979418159, + "learning_rate": 3.5799999999999995e-07, + "loss": 0.0004, + "num_tokens": 88948312.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 179.78125, + "completions/mean_terminated_length": 179.78125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.44150594756900335, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.018077690227073617, + "learning_rate": 3.5599999999999996e-07, + "loss": 0.0007, + "num_tokens": 88964081.0, + "reward": 2.965651035308838, + "reward_std": 0.07535740733146667, + "rewards/reward_fn/mean": 2.965651035308838, + "rewards/reward_fn/std": 0.07535737752914429, + "step": 3823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 64.125, + "completions/mean_terminated_length": 64.125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.4416214343457674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.255859375, + "kl": 0.017502481015981175, + "learning_rate": 3.5399999999999997e-07, + "loss": 0.0007, + "num_tokens": 88978261.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 190.25, + "completions/mean_terminated_length": 190.25, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.4417369211225315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.007454817736288533, + "learning_rate": 3.52e-07, + "loss": 0.0003, + "num_tokens": 89007581.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 155.125, + "completions/mean_terminated_length": 155.125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.44185240789929553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.01048882509348914, + "learning_rate": 3.5e-07, + "loss": 0.0004, + "num_tokens": 89029025.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 90.03125, + "completions/mean_terminated_length": 90.03125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.44196789467605957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.007708612785791047, + "learning_rate": 3.4799999999999994e-07, + "loss": 0.0003, + "num_tokens": 89041666.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 86.90625, + "completions/mean_terminated_length": 86.90625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.44208338145282366, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.859375, + "kl": 0.006454002768805367, + "learning_rate": 3.4599999999999995e-07, + "loss": 0.0003, + "num_tokens": 89058367.0, + "reward": 3.9858343601226807, + "reward_std": 0.08013289421796799, + "rewards/reward_fn/mean": 3.9858343601226807, + "rewards/reward_fn/std": 0.0801328718662262, + "step": 3828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 90.21875, + "completions/mean_terminated_length": 90.21875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.4421988682295877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.007643400407687295, + "learning_rate": 3.4399999999999996e-07, + "loss": 0.0003, + "num_tokens": 89078854.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 501.34375, + "completions/mean_terminated_length": 501.34375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.4423143550063518, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.01124716729100328, + "learning_rate": 3.42e-07, + "loss": 0.0004, + "num_tokens": 89110897.0, + "reward": 3.7141146659851074, + "reward_std": 0.7684869170188904, + "rewards/reward_fn/mean": 3.7141146659851074, + "rewards/reward_fn/std": 0.7684868574142456, + "step": 3830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 87.625, + "completions/mean_terminated_length": 87.625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.44242984178311584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.00308683239381935, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0001, + "num_tokens": 89139237.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 88.0, + "completions/max_terminated_length": 88.0, + "completions/mean_length": 61.125, + "completions/mean_terminated_length": 61.125, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.4425453285598799, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.007233639353216859, + "learning_rate": 3.38e-07, + "loss": 0.0003, + "num_tokens": 89160809.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 82.65625, + "completions/mean_terminated_length": 82.65625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.442660815336644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04248046875, + "kl": 0.003949372490751557, + "learning_rate": 3.36e-07, + "loss": 0.0002, + "num_tokens": 89185950.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1407.0, + "completions/max_terminated_length": 1407.0, + "completions/mean_length": 440.28125, + "completions/mean_terminated_length": 440.28125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.442776302113408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.009429017583897803, + "learning_rate": 3.34e-07, + "loss": 0.0004, + "num_tokens": 89222087.0, + "reward": 3.3931331634521484, + "reward_std": 0.7272924780845642, + "rewards/reward_fn/mean": 3.3931331634521484, + "rewards/reward_fn/std": 0.7272924780845642, + "step": 3834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 299.46875, + "completions/mean_terminated_length": 299.46875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.44289178889017206, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.01201584713999182, + "learning_rate": 3.32e-07, + "loss": 0.0005, + "num_tokens": 89252886.0, + "reward": 3.6430251598358154, + "reward_std": 0.5323553085327148, + "rewards/reward_fn/mean": 3.6430251598358154, + "rewards/reward_fn/std": 0.5323553085327148, + "step": 3835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 95.28125, + "completions/mean_terminated_length": 95.28125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.44300727566693615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.012618014465260785, + "learning_rate": 3.3e-07, + "loss": 0.0005, + "num_tokens": 89268287.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 116.6875, + "completions/mean_terminated_length": 116.6875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.4431227624437002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2294921875, + "kl": 0.01720364938955754, + "learning_rate": 3.28e-07, + "loss": 0.0007, + "num_tokens": 89287861.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 181.4375, + "completions/mean_terminated_length": 181.4375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.44323824922046423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.012315873696934432, + "learning_rate": 3.26e-07, + "loss": 0.0005, + "num_tokens": 89311747.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 72.875, + "completions/mean_terminated_length": 72.875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.4433537359972283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.004347036765466328, + "learning_rate": 3.24e-07, + "loss": 0.0002, + "num_tokens": 89337375.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 213.15625, + "completions/mean_terminated_length": 213.15625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.44346922277399237, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.007995137057150714, + "learning_rate": 3.22e-07, + "loss": 0.0003, + "num_tokens": 89373284.0, + "reward": 3.9674551486968994, + "reward_std": 0.1841016411781311, + "rewards/reward_fn/mean": 3.9674551486968994, + "rewards/reward_fn/std": 0.1841016560792923, + "step": 3840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 79.21875, + "completions/mean_terminated_length": 79.21875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.44358470955075646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054443359375, + "kl": 0.0037661147207472823, + "learning_rate": 3.2e-07, + "loss": 0.0002, + "num_tokens": 89402507.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 127.375, + "completions/mean_terminated_length": 127.375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.4437001963275205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05517578125, + "kl": 0.005799785094495746, + "learning_rate": 3.18e-07, + "loss": 0.0002, + "num_tokens": 89429431.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 218.875, + "completions/mean_terminated_length": 218.875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.44381568310428454, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.014801856828853488, + "learning_rate": 3.1599999999999997e-07, + "loss": 0.0006, + "num_tokens": 89461523.0, + "reward": 3.9772837162017822, + "reward_std": 0.1285029649734497, + "rewards/reward_fn/mean": 3.9772837162017822, + "rewards/reward_fn/std": 0.1285029649734497, + "step": 3843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 166.75, + "completions/mean_terminated_length": 166.75, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.44393116988104864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.005616798378468957, + "learning_rate": 3.14e-07, + "loss": 0.0002, + "num_tokens": 89487563.0, + "reward": 3.9441399574279785, + "reward_std": 0.219863161444664, + "rewards/reward_fn/mean": 3.9441399574279785, + "rewards/reward_fn/std": 0.2198631465435028, + "step": 3844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 165.625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.4440466566578127, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.0067485765321180224, + "learning_rate": 3.12e-07, + "loss": 0.0003, + "num_tokens": 89510975.0, + "reward": 3.851992607116699, + "reward_std": 0.2083798199892044, + "rewards/reward_fn/mean": 3.851992607116699, + "rewards/reward_fn/std": 0.20837979018688202, + "step": 3845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 145.28125, + "completions/mean_terminated_length": 145.28125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.4441621434345767, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.012748409433697816, + "learning_rate": 3.1e-07, + "loss": 0.0005, + "num_tokens": 89536360.0, + "reward": 3.9365878105163574, + "reward_std": 0.2555597126483917, + "rewards/reward_fn/mean": 3.9365878105163574, + "rewards/reward_fn/std": 0.2555597424507141, + "step": 3846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 72.40625, + "completions/mean_terminated_length": 72.40625, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.4442776302113408, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.234375, + "kl": 0.017950927358469926, + "learning_rate": 3.08e-07, + "loss": 0.0007, + "num_tokens": 89547285.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 103.84375, + "completions/mean_terminated_length": 103.84375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.44439311698810485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.010431157861603424, + "learning_rate": 3.0599999999999996e-07, + "loss": 0.0004, + "num_tokens": 89565552.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 283.03125, + "completions/mean_terminated_length": 283.03125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.44450860376486895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.009408409969182685, + "learning_rate": 3.0399999999999997e-07, + "loss": 0.0004, + "num_tokens": 89588369.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 244.0, + "completions/mean_terminated_length": 244.0, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.444624090541633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.009938077702827286, + "learning_rate": 3.02e-07, + "loss": 0.0004, + "num_tokens": 89610513.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 106.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 76.9375, + "completions/mean_terminated_length": 76.9375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.44473957731839703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.005968499470327515, + "learning_rate": 3e-07, + "loss": 0.0002, + "num_tokens": 89638703.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 94.0, + "completions/max_terminated_length": 94.0, + "completions/mean_length": 72.03125, + "completions/mean_terminated_length": 72.03125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.4448550640951611, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1630859375, + "kl": 0.014997827965999022, + "learning_rate": 2.98e-07, + "loss": 0.0006, + "num_tokens": 89659408.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 142.71875, + "completions/mean_terminated_length": 142.71875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.44497055087192516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.010878928689635359, + "learning_rate": 2.9599999999999995e-07, + "loss": 0.0004, + "num_tokens": 89678791.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 212.25, + "completions/mean_terminated_length": 212.25, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.4450860376486892, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.014439622085774317, + "learning_rate": 2.9399999999999996e-07, + "loss": 0.0006, + "num_tokens": 89711183.0, + "reward": 3.926546335220337, + "reward_std": 0.23809655010700226, + "rewards/reward_fn/mean": 3.926546335220337, + "rewards/reward_fn/std": 0.23809653520584106, + "step": 3854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 205.28125, + "completions/mean_terminated_length": 205.28125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.4452015244254533, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.012469511304516345, + "learning_rate": 2.9199999999999997e-07, + "loss": 0.0005, + "num_tokens": 89729592.0, + "reward": 3.8960485458374023, + "reward_std": 0.43818211555480957, + "rewards/reward_fn/mean": 3.8960485458374023, + "rewards/reward_fn/std": 0.43818214535713196, + "step": 3855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 291.8125, + "completions/mean_terminated_length": 291.8125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.44531701120221734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.007913586610811763, + "learning_rate": 2.9e-07, + "loss": 0.0003, + "num_tokens": 89757874.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 72.0, + "completions/max_terminated_length": 72.0, + "completions/mean_length": 50.28125, + "completions/mean_terminated_length": 50.28125, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.44543249797898143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.006419352939701639, + "learning_rate": 2.88e-07, + "loss": 0.0003, + "num_tokens": 89786939.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 997.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 384.4375, + "completions/mean_terminated_length": 384.4375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.4455479847557455, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.011094405766925775, + "learning_rate": 2.8599999999999994e-07, + "loss": 0.0004, + "num_tokens": 89824073.0, + "reward": 3.278907060623169, + "reward_std": 0.6716399192810059, + "rewards/reward_fn/mean": 3.278907060623169, + "rewards/reward_fn/std": 0.6716399192810059, + "step": 3858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 263.90625, + "completions/mean_terminated_length": 263.90625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.4456634715325095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.009870000372757204, + "learning_rate": 2.8399999999999995e-07, + "loss": 0.0004, + "num_tokens": 89844006.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 190.4375, + "completions/mean_terminated_length": 190.4375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.4457789583092736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.011282573348580627, + "learning_rate": 2.8199999999999996e-07, + "loss": 0.0005, + "num_tokens": 89874292.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 318.1875, + "completions/mean_terminated_length": 318.1875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.44589444508603765, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.008490615473419894, + "learning_rate": 2.8e-07, + "loss": 0.0003, + "num_tokens": 89899546.0, + "reward": 3.9308152198791504, + "reward_std": 0.39136770367622375, + "rewards/reward_fn/mean": 3.9308152198791504, + "rewards/reward_fn/std": 0.39136770367622375, + "step": 3861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 136.46875, + "completions/mean_terminated_length": 136.46875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.4460099318628017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050048828125, + "kl": 0.0065011775950551964, + "learning_rate": 2.7800000000000003e-07, + "loss": 0.0003, + "num_tokens": 89935305.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 165.34375, + "completions/mean_terminated_length": 165.34375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.4461254186395658, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.234375, + "kl": 0.009325299171905499, + "learning_rate": 2.7600000000000004e-07, + "loss": 0.0004, + "num_tokens": 89968756.0, + "reward": 3.9326109886169434, + "reward_std": 0.3812098801136017, + "rewards/reward_fn/mean": 3.9326109886169434, + "rewards/reward_fn/std": 0.3812099099159241, + "step": 3863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 324.96875, + "completions/mean_terminated_length": 324.96875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.4462409054163298, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.011152110528200865, + "learning_rate": 2.74e-07, + "loss": 0.0004, + "num_tokens": 89991347.0, + "reward": 3.853511333465576, + "reward_std": 0.5764484405517578, + "rewards/reward_fn/mean": 3.853511333465576, + "rewards/reward_fn/std": 0.5764485001564026, + "step": 3864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 386.3125, + "completions/mean_terminated_length": 386.3125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.44635639219309386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.011409395883674733, + "learning_rate": 2.72e-07, + "loss": 0.0005, + "num_tokens": 90027421.0, + "reward": 2.6390082836151123, + "reward_std": 0.5084059238433838, + "rewards/reward_fn/mean": 2.6390082836151123, + "rewards/reward_fn/std": 0.508405864238739, + "step": 3865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 153.6875, + "completions/mean_terminated_length": 153.6875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.44647187896985796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050048828125, + "kl": 0.005223295294854324, + "learning_rate": 2.7e-07, + "loss": 0.0002, + "num_tokens": 90046227.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 282.9375, + "completions/mean_terminated_length": 282.9375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.446587365746622, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.013305961081641726, + "learning_rate": 2.68e-07, + "loss": 0.0005, + "num_tokens": 90084785.0, + "reward": 3.5945539474487305, + "reward_std": 0.5260501503944397, + "rewards/reward_fn/mean": 3.5945539474487305, + "rewards/reward_fn/std": 0.5260501503944397, + "step": 3867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 221.0, + "completions/mean_terminated_length": 221.0, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.4467028525233861, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.01583972532534972, + "learning_rate": 2.66e-07, + "loss": 0.0006, + "num_tokens": 90117233.0, + "reward": 3.731640338897705, + "reward_std": 0.3145281970500946, + "rewards/reward_fn/mean": 3.731640338897705, + "rewards/reward_fn/std": 0.3145281970500946, + "step": 3868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 329.15625, + "completions/mean_terminated_length": 329.15625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.44681833930015014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.007299988188606221, + "learning_rate": 2.64e-07, + "loss": 0.0003, + "num_tokens": 90150454.0, + "reward": 3.5727717876434326, + "reward_std": 0.8063051104545593, + "rewards/reward_fn/mean": 3.5727717876434326, + "rewards/reward_fn/std": 0.8063050508499146, + "step": 3869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 190.125, + "completions/mean_terminated_length": 190.125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.4469338260769142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.006382723709975835, + "learning_rate": 2.62e-07, + "loss": 0.0003, + "num_tokens": 90169178.0, + "reward": 3.4312217235565186, + "reward_std": 0.03250185400247574, + "rewards/reward_fn/mean": 3.4312217235565186, + "rewards/reward_fn/std": 0.032501835376024246, + "step": 3870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 141.4375, + "completions/mean_terminated_length": 141.4375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.44704931285367827, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.010796608548844233, + "learning_rate": 2.6e-07, + "loss": 0.0004, + "num_tokens": 90202568.0, + "reward": 3.7322750091552734, + "reward_std": 0.6248207688331604, + "rewards/reward_fn/mean": 3.7322750091552734, + "rewards/reward_fn/std": 0.6248207688331604, + "step": 3871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 208.125, + "completions/mean_terminated_length": 208.125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.4471647996304423, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.01112669799476862, + "learning_rate": 2.58e-07, + "loss": 0.0004, + "num_tokens": 90230572.0, + "reward": 3.1797657012939453, + "reward_std": 0.13682673871517181, + "rewards/reward_fn/mean": 3.1797657012939453, + "rewards/reward_fn/std": 0.136826753616333, + "step": 3872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 127.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 85.25, + "completions/mean_terminated_length": 85.25, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.44728028640720635, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.004201254600047832, + "learning_rate": 2.56e-07, + "loss": 0.0002, + "num_tokens": 90245332.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 102.0, + "completions/max_terminated_length": 102.0, + "completions/mean_length": 70.9375, + "completions/mean_terminated_length": 70.9375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.44739577318397045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.008975925738923252, + "learning_rate": 2.5399999999999997e-07, + "loss": 0.0004, + "num_tokens": 90270034.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 317.6875, + "completions/mean_terminated_length": 317.6875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.4475112599607345, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.011185320909135044, + "learning_rate": 2.52e-07, + "loss": 0.0004, + "num_tokens": 90291752.0, + "reward": 3.510223388671875, + "reward_std": 0.9405273795127869, + "rewards/reward_fn/mean": 3.510223388671875, + "rewards/reward_fn/std": 0.9405273795127869, + "step": 3875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 127.3125, + "completions/mean_terminated_length": 127.3125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.4476267467374986, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.009946472622686997, + "learning_rate": 2.5e-07, + "loss": 0.0004, + "num_tokens": 90305202.0, + "reward": 2.771894693374634, + "reward_std": 0.03418605402112007, + "rewards/reward_fn/mean": 2.771894693374634, + "rewards/reward_fn/std": 0.03418607637286186, + "step": 3876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 83.875, + "completions/mean_terminated_length": 83.875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.4477422335142626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.006446522807891597, + "learning_rate": 2.48e-07, + "loss": 0.0003, + "num_tokens": 90331758.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 298.84375, + "completions/mean_terminated_length": 298.84375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.44785772029102666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.010767984480480663, + "learning_rate": 2.46e-07, + "loss": 0.0004, + "num_tokens": 90353897.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 80.6875, + "completions/mean_terminated_length": 80.6875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.44797320706779076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.171875, + "kl": 0.010479373435373418, + "learning_rate": 2.4399999999999996e-07, + "loss": 0.0004, + "num_tokens": 90375551.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 57.96875, + "completions/mean_terminated_length": 57.96875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.4480886938445548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.185546875, + "kl": 0.013024665138800628, + "learning_rate": 2.4199999999999997e-07, + "loss": 0.0005, + "num_tokens": 90388894.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 139.90625, + "completions/mean_terminated_length": 139.90625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.44820418062131884, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.007645757534191944, + "learning_rate": 2.4e-07, + "loss": 0.0003, + "num_tokens": 90405371.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 96.46875, + "completions/mean_terminated_length": 96.46875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.44831966739808293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25390625, + "kl": 0.012037511260132305, + "learning_rate": 2.38e-07, + "loss": 0.0005, + "num_tokens": 90429386.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 151.1875, + "completions/mean_terminated_length": 90.0, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.448435154174847, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.52734375, + "kl": 0.007148218701331643, + "learning_rate": 2.3599999999999997e-07, + "loss": 0.0003, + "num_tokens": 90453264.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 173.0625, + "completions/mean_terminated_length": 173.0625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.44855064095161107, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.009385602774273138, + "learning_rate": 2.34e-07, + "loss": 0.0004, + "num_tokens": 90474994.0, + "reward": 3.8620409965515137, + "reward_std": 0.3716421127319336, + "rewards/reward_fn/mean": 3.8620409965515137, + "rewards/reward_fn/std": 0.3716421127319336, + "step": 3884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 82.28125, + "completions/mean_terminated_length": 82.28125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.4486661277283751, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.00457176289819472, + "learning_rate": 2.32e-07, + "loss": 0.0002, + "num_tokens": 90500443.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 88.0, + "completions/mean_terminated_length": 88.0, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.44878161450513915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.005366023331589531, + "learning_rate": 2.3e-07, + "loss": 0.0002, + "num_tokens": 90521691.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 94.8125, + "completions/mean_terminated_length": 94.8125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.44889710128190324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.009042597848747391, + "learning_rate": 2.28e-07, + "loss": 0.0004, + "num_tokens": 90543157.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 121.03125, + "completions/mean_terminated_length": 121.03125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.4490125880586673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.014042303067981265, + "learning_rate": 2.2599999999999999e-07, + "loss": 0.0006, + "num_tokens": 90572822.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 187.15625, + "completions/mean_terminated_length": 187.15625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.4491280748354313, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.008812187479634304, + "learning_rate": 2.24e-07, + "loss": 0.0004, + "num_tokens": 90593851.0, + "reward": 2.749628782272339, + "reward_std": 0.04501626640558243, + "rewards/reward_fn/mean": 2.749628782272339, + "rewards/reward_fn/std": 0.045016270130872726, + "step": 3889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 105.1875, + "completions/mean_terminated_length": 105.1875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.4492435616121954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.25, + "kl": 0.0052653878592536785, + "learning_rate": 2.22e-07, + "loss": 0.0002, + "num_tokens": 90609857.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 95.9375, + "completions/mean_terminated_length": 95.9375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.44935904838895946, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.008599287986726267, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0003, + "num_tokens": 90635423.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 228.40625, + "completions/mean_terminated_length": 228.40625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.4494745351657235, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.011623006823356263, + "learning_rate": 2.18e-07, + "loss": 0.0005, + "num_tokens": 90665260.0, + "reward": 3.6394705772399902, + "reward_std": 0.592107892036438, + "rewards/reward_fn/mean": 3.6394705772399902, + "rewards/reward_fn/std": 0.5921079516410828, + "step": 3892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 226.90625, + "completions/mean_terminated_length": 226.90625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.4495900219424876, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.010169049812247977, + "learning_rate": 2.1599999999999998e-07, + "loss": 0.0004, + "num_tokens": 90699177.0, + "reward": 3.2912392616271973, + "reward_std": 0.3286775052547455, + "rewards/reward_fn/mean": 3.2912392616271973, + "rewards/reward_fn/std": 0.3286775052547455, + "step": 3893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 273.59375, + "completions/mean_terminated_length": 273.59375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.44970550871925163, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.012374088750220835, + "learning_rate": 2.1399999999999998e-07, + "loss": 0.0005, + "num_tokens": 90726684.0, + "reward": 3.5805258750915527, + "reward_std": 0.8563888072967529, + "rewards/reward_fn/mean": 3.5805258750915527, + "rewards/reward_fn/std": 0.8563887476921082, + "step": 3894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 64.75, + "completions/mean_terminated_length": 64.75, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.44982099549601573, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.005175929894903675, + "learning_rate": 2.12e-07, + "loss": 0.0002, + "num_tokens": 90740116.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 216.1875, + "completions/mean_terminated_length": 216.1875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.44993648227277977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.009597462856618222, + "learning_rate": 2.0999999999999997e-07, + "loss": 0.0004, + "num_tokens": 90766394.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 75.5, + "completions/mean_terminated_length": 75.5, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.4500519690495438, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.006291487195994705, + "learning_rate": 2.0799999999999998e-07, + "loss": 0.0003, + "num_tokens": 90781290.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 431.46875, + "completions/mean_terminated_length": 379.32257080078125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.4501674558263079, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.01365497487859102, + "learning_rate": 2.06e-07, + "loss": 0.0005, + "num_tokens": 90813529.0, + "reward": 2.5944180488586426, + "reward_std": 0.790472149848938, + "rewards/reward_fn/mean": 2.5944180488586426, + "rewards/reward_fn/std": 0.7904722094535828, + "step": 3898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 210.59375, + "completions/mean_terminated_length": 210.59375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.45028294260307195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.015472380648134276, + "learning_rate": 2.0399999999999997e-07, + "loss": 0.0006, + "num_tokens": 90846284.0, + "reward": 3.5338146686553955, + "reward_std": 0.4808200001716614, + "rewards/reward_fn/mean": 3.5338146686553955, + "rewards/reward_fn/std": 0.48082005977630615, + "step": 3899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1272.0, + "completions/max_terminated_length": 1272.0, + "completions/mean_length": 319.90625, + "completions/mean_terminated_length": 319.90625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.450398429379836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.011460388996056281, + "learning_rate": 2.02e-07, + "loss": 0.0005, + "num_tokens": 90879209.0, + "reward": 3.573458433151245, + "reward_std": 0.6851809620857239, + "rewards/reward_fn/mean": 3.573458433151245, + "rewards/reward_fn/std": 0.6851810216903687, + "step": 3900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 141.96875, + "completions/mean_terminated_length": 141.96875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.4505139161566001, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.010428824200062081, + "learning_rate": 2e-07, + "loss": 0.0004, + "num_tokens": 90892744.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 86.78125, + "completions/mean_terminated_length": 86.78125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.4506294029333641, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.00754408148713992, + "learning_rate": 1.98e-07, + "loss": 0.0003, + "num_tokens": 90908225.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 202.65625, + "completions/mean_terminated_length": 202.65625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.4507448897101282, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052001953125, + "kl": 0.007670534905628301, + "learning_rate": 1.96e-07, + "loss": 0.0003, + "num_tokens": 90933878.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 179.8125, + "completions/mean_terminated_length": 179.8125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.45086037648689226, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.013215631886851043, + "learning_rate": 1.94e-07, + "loss": 0.0005, + "num_tokens": 90964176.0, + "reward": 3.8236846923828125, + "reward_std": 0.46790575981140137, + "rewards/reward_fn/mean": 3.8236846923828125, + "rewards/reward_fn/std": 0.467905730009079, + "step": 3904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 176.09375, + "completions/mean_terminated_length": 176.09375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.4509758632636563, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.013831748248776421, + "learning_rate": 1.92e-07, + "loss": 0.0006, + "num_tokens": 90982515.0, + "reward": 3.965923309326172, + "reward_std": 0.19276748597621918, + "rewards/reward_fn/mean": 3.965923309326172, + "rewards/reward_fn/std": 0.1927674561738968, + "step": 3905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 216.21875, + "completions/mean_terminated_length": 216.21875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.4510913500404204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.010350926459068432, + "learning_rate": 1.8999999999999998e-07, + "loss": 0.0004, + "num_tokens": 91008826.0, + "reward": 2.8994579315185547, + "reward_std": 0.3505261242389679, + "rewards/reward_fn/mean": 2.8994579315185547, + "rewards/reward_fn/std": 0.3505261242389679, + "step": 3906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 219.21875, + "completions/mean_terminated_length": 219.21875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.45120683681718443, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.01168530659924727, + "learning_rate": 1.88e-07, + "loss": 0.0005, + "num_tokens": 91038241.0, + "reward": 3.3592135906219482, + "reward_std": 0.512951672077179, + "rewards/reward_fn/mean": 3.3592135906219482, + "rewards/reward_fn/std": 0.512951672077179, + "step": 3907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 123.53125, + "completions/mean_terminated_length": 123.53125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.45132232359394847, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.00850895575422328, + "learning_rate": 1.86e-07, + "loss": 0.0003, + "num_tokens": 91063186.0, + "reward": 3.928398609161377, + "reward_std": 0.4050392806529999, + "rewards/reward_fn/mean": 3.928398609161377, + "rewards/reward_fn/std": 0.4050392508506775, + "step": 3908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 238.5625, + "completions/mean_terminated_length": 238.5625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.45143781037071257, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.010279519308824092, + "learning_rate": 1.8399999999999998e-07, + "loss": 0.0004, + "num_tokens": 91089124.0, + "reward": 3.9698808193206787, + "reward_std": 0.17037975788116455, + "rewards/reward_fn/mean": 3.9698808193206787, + "rewards/reward_fn/std": 0.17037972807884216, + "step": 3909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 130.96875, + "completions/mean_terminated_length": 130.96875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.4515532971474766, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.014276120928116143, + "learning_rate": 1.82e-07, + "loss": 0.0006, + "num_tokens": 91101347.0, + "reward": 3.932380437850952, + "reward_std": 0.2660925090312958, + "rewards/reward_fn/mean": 3.932380437850952, + "rewards/reward_fn/std": 0.266092449426651, + "step": 3910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 95.59375, + "completions/mean_terminated_length": 95.59375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.4516687839242407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.01663165858190041, + "learning_rate": 1.8e-07, + "loss": 0.0007, + "num_tokens": 91119222.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 291.875, + "completions/mean_terminated_length": 291.875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.45178427070100474, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.011066817620303482, + "learning_rate": 1.7799999999999998e-07, + "loss": 0.0004, + "num_tokens": 91140466.0, + "reward": 3.5216851234436035, + "reward_std": 0.9185503125190735, + "rewards/reward_fn/mean": 3.5216851234436035, + "rewards/reward_fn/std": 0.9185503721237183, + "step": 3912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 155.75, + "completions/mean_terminated_length": 155.75, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.4518997574777688, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041748046875, + "kl": 0.006050102580047678, + "learning_rate": 1.76e-07, + "loss": 0.0002, + "num_tokens": 91155146.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 158.875, + "completions/mean_terminated_length": 158.875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.4520152442545329, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.009060990167199634, + "learning_rate": 1.7399999999999997e-07, + "loss": 0.0004, + "num_tokens": 91169190.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 221.5, + "completions/mean_terminated_length": 221.5, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.4521307310312969, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.024202953893109225, + "learning_rate": 1.7199999999999998e-07, + "loss": 0.001, + "num_tokens": 91202198.0, + "reward": 3.9318573474884033, + "reward_std": 0.3854726552963257, + "rewards/reward_fn/mean": 3.9318573474884033, + "rewards/reward_fn/std": 0.3854726552963257, + "step": 3915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 216.25, + "completions/mean_terminated_length": 216.25, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.45224621780806096, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.008030492681427859, + "learning_rate": 1.7000000000000001e-07, + "loss": 0.0003, + "num_tokens": 91224254.0, + "reward": 3.8959531784057617, + "reward_std": 0.42777907848358154, + "rewards/reward_fn/mean": 3.8959531784057617, + "rewards/reward_fn/std": 0.42777910828590393, + "step": 3916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 328.75, + "completions/mean_terminated_length": 328.75, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.45236170458482505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.009613201014872175, + "learning_rate": 1.68e-07, + "loss": 0.0004, + "num_tokens": 91249942.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1562.0, + "completions/max_terminated_length": 1562.0, + "completions/mean_length": 469.125, + "completions/mean_terminated_length": 469.125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.4524771913615891, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.009147611162916292, + "learning_rate": 1.66e-07, + "loss": 0.0004, + "num_tokens": 91281754.0, + "reward": 3.8313522338867188, + "reward_std": 0.4965180456638336, + "rewards/reward_fn/mean": 3.8313522338867188, + "rewards/reward_fn/std": 0.4965181052684784, + "step": 3918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 97.28125, + "completions/mean_terminated_length": 97.28125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.45259267813835313, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.546875, + "kl": 0.010292491249856539, + "learning_rate": 1.64e-07, + "loss": 0.0004, + "num_tokens": 91302659.0, + "reward": 3.974482536315918, + "reward_std": 0.14434871077537537, + "rewards/reward_fn/mean": 3.974482536315918, + "rewards/reward_fn/std": 0.14434869587421417, + "step": 3919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 168.78125, + "completions/mean_terminated_length": 168.78125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.45270816491511723, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.859375, + "kl": 0.008659503233502619, + "learning_rate": 1.62e-07, + "loss": 0.0003, + "num_tokens": 91322300.0, + "reward": 3.521808385848999, + "reward_std": 0.12922117114067078, + "rewards/reward_fn/mean": 3.521808385848999, + "rewards/reward_fn/std": 0.1292211413383484, + "step": 3920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 119.875, + "completions/mean_terminated_length": 119.875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.45282365169188127, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.00958455607178621, + "learning_rate": 1.6e-07, + "loss": 0.0004, + "num_tokens": 91344440.0, + "reward": 3.637910842895508, + "reward_std": 0.09594398736953735, + "rewards/reward_fn/mean": 3.637910842895508, + "rewards/reward_fn/std": 0.09594398736953735, + "step": 3921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 177.875, + "completions/mean_terminated_length": 177.875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.45293913846864536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.017018017526424956, + "learning_rate": 1.5799999999999999e-07, + "loss": 0.0007, + "num_tokens": 91362164.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 140.40625, + "completions/mean_terminated_length": 140.40625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.4530546252454094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.005344037803297397, + "learning_rate": 1.56e-07, + "loss": 0.0002, + "num_tokens": 91384353.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 177.90625, + "completions/mean_terminated_length": 177.90625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.45317011202217344, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.02126458106795326, + "learning_rate": 1.54e-07, + "loss": 0.0009, + "num_tokens": 91412318.0, + "reward": 3.040347099304199, + "reward_std": 0.6113703846931458, + "rewards/reward_fn/mean": 3.040347099304199, + "rewards/reward_fn/std": 0.6113703846931458, + "step": 3924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 328.71875, + "completions/mean_terminated_length": 328.71875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.45328559879893754, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.009537824618746527, + "learning_rate": 1.5199999999999998e-07, + "loss": 0.0004, + "num_tokens": 91437845.0, + "reward": 3.724750518798828, + "reward_std": 0.739971399307251, + "rewards/reward_fn/mean": 3.724750518798828, + "rewards/reward_fn/std": 0.7399713397026062, + "step": 3925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 216.875, + "completions/mean_terminated_length": 216.875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.4534010855757016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.015745105760288425, + "learning_rate": 1.5e-07, + "loss": 0.0006, + "num_tokens": 91462801.0, + "reward": 3.706561803817749, + "reward_std": 0.5310116410255432, + "rewards/reward_fn/mean": 3.706561803817749, + "rewards/reward_fn/std": 0.5310116410255432, + "step": 3926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 87.90625, + "completions/mean_terminated_length": 87.90625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.4535165723524656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.006536725089972606, + "learning_rate": 1.4799999999999998e-07, + "loss": 0.0003, + "num_tokens": 91477742.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 92.53125, + "completions/mean_terminated_length": 92.53125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.4536320591292297, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.0068383229699975345, + "learning_rate": 1.4599999999999998e-07, + "loss": 0.0003, + "num_tokens": 91497151.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 110.21875, + "completions/mean_terminated_length": 110.21875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.45374754590599375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.0037985000617482, + "learning_rate": 1.44e-07, + "loss": 0.0002, + "num_tokens": 91512998.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 100.0, + "completions/max_terminated_length": 100.0, + "completions/mean_length": 63.28125, + "completions/mean_terminated_length": 63.28125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.45386303268275785, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.004208719374219072, + "learning_rate": 1.4199999999999997e-07, + "loss": 0.0002, + "num_tokens": 91530799.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 189.71875, + "completions/mean_terminated_length": 189.71875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.4539785194595219, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.018525464547565207, + "learning_rate": 1.4e-07, + "loss": 0.0007, + "num_tokens": 91566278.0, + "reward": 3.9669113159179688, + "reward_std": 0.18717801570892334, + "rewards/reward_fn/mean": 3.9669113159179688, + "rewards/reward_fn/std": 0.18717803061008453, + "step": 3931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 111.09375, + "completions/mean_terminated_length": 111.09375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.45409400623628593, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.0055504500087408815, + "learning_rate": 1.3800000000000002e-07, + "loss": 0.0002, + "num_tokens": 91592233.0, + "reward": 3.9836087226867676, + "reward_std": 0.09272392094135284, + "rewards/reward_fn/mean": 3.9836087226867676, + "rewards/reward_fn/std": 0.09272395074367523, + "step": 3932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 340.5, + "completions/mean_terminated_length": 340.5, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.45420949301305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03759765625, + "kl": 0.009267398898373358, + "learning_rate": 1.36e-07, + "loss": 0.0004, + "num_tokens": 91615449.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 92.96875, + "completions/mean_terminated_length": 92.96875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.45432497978981407, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.0017827329438659945, + "learning_rate": 1.34e-07, + "loss": 0.0001, + "num_tokens": 91629816.0, + "reward": 3.9863834381103516, + "reward_std": 0.07702664285898209, + "rewards/reward_fn/mean": 3.9863834381103516, + "rewards/reward_fn/std": 0.07702665776014328, + "step": 3934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 168.4375, + "completions/mean_terminated_length": 168.4375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.4544404665665781, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.009937977578374557, + "learning_rate": 1.32e-07, + "loss": 0.0004, + "num_tokens": 91653478.0, + "reward": 3.9344964027404785, + "reward_std": 0.25784748792648315, + "rewards/reward_fn/mean": 3.9344964027404785, + "rewards/reward_fn/std": 0.25784745812416077, + "step": 3935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 192.125, + "completions/mean_terminated_length": 192.125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.4545559533433422, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.018594935230794363, + "learning_rate": 1.3e-07, + "loss": 0.0007, + "num_tokens": 91671402.0, + "reward": 3.7567498683929443, + "reward_std": 0.3973569869995117, + "rewards/reward_fn/mean": 3.7567498683929443, + "rewards/reward_fn/std": 0.3973569869995117, + "step": 3936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 115.34375, + "completions/mean_terminated_length": 115.34375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.45467144012010624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1123046875, + "kl": 0.010941008389636409, + "learning_rate": 1.28e-07, + "loss": 0.0004, + "num_tokens": 91693301.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 302.5, + "completions/mean_terminated_length": 302.5, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.45478692689687034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04248046875, + "kl": 0.0075580564080155455, + "learning_rate": 1.26e-07, + "loss": 0.0003, + "num_tokens": 91717989.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 241.375, + "completions/mean_terminated_length": 241.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.4549024136736344, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.017012122625601478, + "learning_rate": 1.24e-07, + "loss": 0.0007, + "num_tokens": 91742961.0, + "reward": 3.5100951194763184, + "reward_std": 0.5551971793174744, + "rewards/reward_fn/mean": 3.5100951194763184, + "rewards/reward_fn/std": 0.5551971793174744, + "step": 3939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 105.09375, + "completions/mean_terminated_length": 105.09375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.4550179004503984, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.28125, + "kl": 0.008239011949626729, + "learning_rate": 1.2199999999999998e-07, + "loss": 0.0003, + "num_tokens": 91758516.0, + "reward": 3.2452151775360107, + "reward_std": 0.5552629232406616, + "rewards/reward_fn/mean": 3.2452151775360107, + "rewards/reward_fn/std": 0.5552628636360168, + "step": 3940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 239.71875, + "completions/mean_terminated_length": 239.71875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.4551333872271625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035400390625, + "kl": 0.0072919184603961185, + "learning_rate": 1.2e-07, + "loss": 0.0003, + "num_tokens": 91775819.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 69.625, + "completions/mean_terminated_length": 69.625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.45524887400392655, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.0625, + "kl": 0.004701890819887922, + "learning_rate": 1.1799999999999998e-07, + "loss": 0.0002, + "num_tokens": 91789983.0, + "reward": 3.875, + "reward_std": 0.7071067690849304, + "rewards/reward_fn/mean": 3.875, + "rewards/reward_fn/std": 0.7071067690849304, + "step": 3942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 91.09375, + "completions/mean_terminated_length": 91.09375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.4553643607806906, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.0074029062561749015, + "learning_rate": 1.16e-07, + "loss": 0.0003, + "num_tokens": 91814274.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 100.0, + "completions/max_terminated_length": 100.0, + "completions/mean_length": 77.09375, + "completions/mean_terminated_length": 77.09375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.4554798475574547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0458984375, + "kl": 0.00319634010156733, + "learning_rate": 1.14e-07, + "loss": 0.0001, + "num_tokens": 91837541.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 152.84375, + "completions/mean_terminated_length": 152.84375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.4555953343342187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.008999963298265357, + "learning_rate": 1.12e-07, + "loss": 0.0004, + "num_tokens": 91858944.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 212.03125, + "completions/mean_terminated_length": 212.03125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.45571082111098277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043212890625, + "kl": 0.008396740449825302, + "learning_rate": 1.0999999999999999e-07, + "loss": 0.0003, + "num_tokens": 91876001.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 86.5625, + "completions/mean_terminated_length": 86.5625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.45582630788774686, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3125, + "kl": 0.00864203388482565, + "learning_rate": 1.0799999999999999e-07, + "loss": 0.0003, + "num_tokens": 91900019.0, + "reward": 3.9337120056152344, + "reward_std": 0.3749813735485077, + "rewards/reward_fn/mean": 3.9337120056152344, + "rewards/reward_fn/std": 0.3749813139438629, + "step": 3947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 209.59375, + "completions/mean_terminated_length": 209.59375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.4559417946645109, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.009333669222542085, + "learning_rate": 1.06e-07, + "loss": 0.0004, + "num_tokens": 91918662.0, + "reward": 3.929936408996582, + "reward_std": 0.39634019136428833, + "rewards/reward_fn/mean": 3.929936408996582, + "rewards/reward_fn/std": 0.39634019136428833, + "step": 3948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 188.8125, + "completions/mean_terminated_length": 188.8125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.456057281441275, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.014767863147426397, + "learning_rate": 1.0399999999999999e-07, + "loss": 0.0006, + "num_tokens": 91937120.0, + "reward": 3.9351770877838135, + "reward_std": 0.3666941225528717, + "rewards/reward_fn/mean": 3.9351770877838135, + "rewards/reward_fn/std": 0.3666941523551941, + "step": 3949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 243.96875, + "completions/mean_terminated_length": 243.96875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.45617276821803904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.009399727176059969, + "learning_rate": 1.0199999999999999e-07, + "loss": 0.0004, + "num_tokens": 91955519.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 88.90625, + "completions/mean_terminated_length": 88.90625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.4562882549948031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.01342821356956847, + "learning_rate": 1e-07, + "loss": 0.0005, + "num_tokens": 91967868.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 73.34375, + "completions/mean_terminated_length": 73.34375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.4564037417715672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034423828125, + "kl": 0.0030063903777772794, + "learning_rate": 9.8e-08, + "loss": 0.0001, + "num_tokens": 91992903.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 121.90625, + "completions/mean_terminated_length": 121.90625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.4565192285483312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.007840782112907618, + "learning_rate": 9.6e-08, + "loss": 0.0003, + "num_tokens": 92008164.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 95.15625, + "completions/mean_terminated_length": 95.15625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.45663471532509525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.004551149020699086, + "learning_rate": 9.4e-08, + "loss": 0.0002, + "num_tokens": 92025097.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 242.5625, + "completions/mean_terminated_length": 242.5625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.45675020210185935, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.012559315073303878, + "learning_rate": 9.199999999999999e-08, + "loss": 0.0005, + "num_tokens": 92045531.0, + "reward": 3.3042662143707275, + "reward_std": 1.0486494302749634, + "rewards/reward_fn/mean": 3.3042662143707275, + "rewards/reward_fn/std": 1.0486493110656738, + "step": 3955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 94.0, + "completions/max_terminated_length": 94.0, + "completions/mean_length": 63.78125, + "completions/mean_terminated_length": 63.78125, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.4568656888786234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.0055558111089339945, + "learning_rate": 9e-08, + "loss": 0.0002, + "num_tokens": 92067412.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 427.1875, + "completions/mean_terminated_length": 374.9032287597656, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.4569811756553875, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87109375, + "kl": 0.00765185591444606, + "learning_rate": 8.8e-08, + "loss": 0.0003, + "num_tokens": 92100666.0, + "reward": 3.111621856689453, + "reward_std": 0.7421124577522278, + "rewards/reward_fn/mean": 3.111621856689453, + "rewards/reward_fn/std": 0.7421124577522278, + "step": 3957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 96.0, + "completions/max_terminated_length": 96.0, + "completions/mean_length": 64.1875, + "completions/mean_terminated_length": 64.1875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.4570966624321515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.009020385945404996, + "learning_rate": 8.599999999999999e-08, + "loss": 0.0004, + "num_tokens": 92116448.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 159.15625, + "completions/mean_terminated_length": 159.15625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.45721214920891556, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.015625, + "kl": 0.010762580088339746, + "learning_rate": 8.4e-08, + "loss": 0.0004, + "num_tokens": 92140005.0, + "reward": 3.81577467918396, + "reward_std": 0.4350939989089966, + "rewards/reward_fn/mean": 3.81577467918396, + "rewards/reward_fn/std": 0.4350939989089966, + "step": 3959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 508.5, + "completions/mean_terminated_length": 508.5, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.45732763598567966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03466796875, + "kl": 0.008558266286854632, + "learning_rate": 8.2e-08, + "loss": 0.0003, + "num_tokens": 92166101.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 73.21875, + "completions/mean_terminated_length": 73.21875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.4574431227624437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15234375, + "kl": 0.016118558080052026, + "learning_rate": 8e-08, + "loss": 0.0006, + "num_tokens": 92181020.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 153.75, + "completions/mean_terminated_length": 92.64515686035156, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.45755860953920774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.77734375, + "kl": 0.0044313646176306065, + "learning_rate": 7.8e-08, + "loss": 0.0002, + "num_tokens": 92205268.0, + "reward": 3.931942939758301, + "reward_std": 0.3849884867668152, + "rewards/reward_fn/mean": 3.931942939758301, + "rewards/reward_fn/std": 0.3849884271621704, + "step": 3962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 229.625, + "completions/mean_terminated_length": 229.625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.45767409631597183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.00631932909891475, + "learning_rate": 7.599999999999999e-08, + "loss": 0.0003, + "num_tokens": 92234792.0, + "reward": 3.765263795852661, + "reward_std": 0.35432666540145874, + "rewards/reward_fn/mean": 3.765263795852661, + "rewards/reward_fn/std": 0.35432666540145874, + "step": 3963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 134.84375, + "completions/mean_terminated_length": 134.84375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.4577895830927359, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.009716990796732716, + "learning_rate": 7.399999999999999e-08, + "loss": 0.0004, + "num_tokens": 92261923.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 105.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 70.03125, + "completions/mean_terminated_length": 70.03125, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.45790506986949997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.0080565781063342, + "learning_rate": 7.2e-08, + "loss": 0.0003, + "num_tokens": 92273924.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 120.0, + "completions/max_terminated_length": 120.0, + "completions/mean_length": 90.625, + "completions/mean_terminated_length": 90.625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.458020556646264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.015336253360146657, + "learning_rate": 7e-08, + "loss": 0.0006, + "num_tokens": 92293976.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 182.03125, + "completions/mean_terminated_length": 182.03125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.45813604342302805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.010667149574146606, + "learning_rate": 6.8e-08, + "loss": 0.0004, + "num_tokens": 92322169.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 86.0, + "completions/max_terminated_length": 86.0, + "completions/mean_length": 60.6875, + "completions/mean_terminated_length": 60.6875, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.45825153019979215, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.004837974273868895, + "learning_rate": 6.6e-08, + "loss": 0.0002, + "num_tokens": 92340495.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 203.71875, + "completions/mean_terminated_length": 203.71875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.4583670169765562, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.009444959374377504, + "learning_rate": 6.4e-08, + "loss": 0.0004, + "num_tokens": 92362086.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 147.5, + "completions/mean_terminated_length": 147.5, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.4584825037533202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1640625, + "kl": 0.01135790471744258, + "learning_rate": 6.2e-08, + "loss": 0.0005, + "num_tokens": 92379062.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 217.90625, + "completions/mean_terminated_length": 217.90625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.4585979905300843, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.011205375762074254, + "learning_rate": 6e-08, + "loss": 0.0004, + "num_tokens": 92395859.0, + "reward": 3.0186784267425537, + "reward_std": 0.2352946251630783, + "rewards/reward_fn/mean": 3.0186784267425537, + "rewards/reward_fn/std": 0.2352946549654007, + "step": 3971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 99.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 73.75, + "completions/mean_terminated_length": 73.75, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.45871347730684836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.007561851685750298, + "learning_rate": 5.8e-08, + "loss": 0.0003, + "num_tokens": 92416523.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 101.3125, + "completions/mean_terminated_length": 101.3125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.4588289640836124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.010824413526279386, + "learning_rate": 5.6e-08, + "loss": 0.0004, + "num_tokens": 92441493.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 242.96875, + "completions/mean_terminated_length": 242.96875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.4589444508603765, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.020879737741779536, + "learning_rate": 5.3999999999999994e-08, + "loss": 0.0008, + "num_tokens": 92471892.0, + "reward": 3.0467164516448975, + "reward_std": 0.5576600432395935, + "rewards/reward_fn/mean": 3.0467164516448975, + "rewards/reward_fn/std": 0.5576601028442383, + "step": 3974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 135.59375, + "completions/mean_terminated_length": 135.59375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.45905993763714054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.012081760418368503, + "learning_rate": 5.1999999999999996e-08, + "loss": 0.0005, + "num_tokens": 92493799.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 108.21875, + "completions/mean_terminated_length": 108.21875, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.45917542441390463, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.328125, + "kl": 0.013189972640248016, + "learning_rate": 5e-08, + "loss": 0.0005, + "num_tokens": 92510350.0, + "reward": 3.761218309402466, + "reward_std": 0.714861273765564, + "rewards/reward_fn/mean": 3.761218309402466, + "rewards/reward_fn/std": 0.714861273765564, + "step": 3976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 295.28125, + "completions/mean_terminated_length": 295.28125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.45929091119066867, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.01677210755588021, + "learning_rate": 4.8e-08, + "loss": 0.0007, + "num_tokens": 92536567.0, + "reward": 3.825528860092163, + "reward_std": 0.580507755279541, + "rewards/reward_fn/mean": 3.825528860092163, + "rewards/reward_fn/std": 0.5805078148841858, + "step": 3977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 258.75, + "completions/mean_terminated_length": 258.75, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.4594063979674327, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.008904227201128379, + "learning_rate": 4.5999999999999995e-08, + "loss": 0.0004, + "num_tokens": 92568911.0, + "reward": 3.780263900756836, + "reward_std": 0.6941746473312378, + "rewards/reward_fn/mean": 3.780263900756836, + "rewards/reward_fn/std": 0.6941746473312378, + "step": 3978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 214.9375, + "completions/mean_terminated_length": 214.9375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.4595218847441968, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.01174291066126898, + "learning_rate": 4.4e-08, + "loss": 0.0005, + "num_tokens": 92597741.0, + "reward": 3.9749627113342285, + "reward_std": 0.14163289964199066, + "rewards/reward_fn/mean": 3.9749627113342285, + "rewards/reward_fn/std": 0.14163288474082947, + "step": 3979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 333.09375, + "completions/mean_terminated_length": 333.09375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.45963737152096085, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.01034080887620803, + "learning_rate": 4.2e-08, + "loss": 0.0004, + "num_tokens": 92631920.0, + "reward": 2.9278621673583984, + "reward_std": 0.6050671339035034, + "rewards/reward_fn/mean": 2.9278621673583984, + "rewards/reward_fn/std": 0.6050671339035034, + "step": 3980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 149.0, + "completions/mean_terminated_length": 149.0, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.4597528582977249, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.84375, + "kl": 0.013360370438022073, + "learning_rate": 4e-08, + "loss": 0.0005, + "num_tokens": 92649392.0, + "reward": 3.6538267135620117, + "reward_std": 0.5223504304885864, + "rewards/reward_fn/mean": 3.6538267135620117, + "rewards/reward_fn/std": 0.5223503708839417, + "step": 3981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 136.5, + "completions/mean_terminated_length": 136.5, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.459868345074489, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.0034071858308379888, + "learning_rate": 3.7999999999999996e-08, + "loss": 0.0001, + "num_tokens": 92680224.0, + "reward": 3.9334218502044678, + "reward_std": 0.3766232430934906, + "rewards/reward_fn/mean": 3.9334218502044678, + "rewards/reward_fn/std": 0.3766232430934906, + "step": 3982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 139.0, + "completions/max_terminated_length": 139.0, + "completions/mean_length": 94.3125, + "completions/mean_terminated_length": 94.3125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.459983831851253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05712890625, + "kl": 0.00405117012996925, + "learning_rate": 3.6e-08, + "loss": 0.0002, + "num_tokens": 92698218.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 333.25, + "completions/mean_terminated_length": 333.25, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.4600993186280171, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03759765625, + "kl": 0.007808665992342867, + "learning_rate": 3.4e-08, + "loss": 0.0003, + "num_tokens": 92720466.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 75.0, + "completions/max_terminated_length": 75.0, + "completions/mean_length": 62.34375, + "completions/mean_terminated_length": 62.34375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.46021480540478116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.00486275294679217, + "learning_rate": 3.2e-08, + "loss": 0.0002, + "num_tokens": 92734557.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 128.875, + "completions/mean_terminated_length": 128.875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.4603302921815452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.01439208717783913, + "learning_rate": 3e-08, + "loss": 0.0006, + "num_tokens": 92750521.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 85.6875, + "completions/mean_terminated_length": 85.6875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.4604457789583093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.005547038199438248, + "learning_rate": 2.8e-08, + "loss": 0.0002, + "num_tokens": 92772335.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 399.84375, + "completions/mean_terminated_length": 399.84375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.46056126573507333, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.012243278324604034, + "learning_rate": 2.5999999999999998e-08, + "loss": 0.0005, + "num_tokens": 92798026.0, + "reward": 3.790771961212158, + "reward_std": 0.6609636545181274, + "rewards/reward_fn/mean": 3.790771961212158, + "rewards/reward_fn/std": 0.6609637141227722, + "step": 3988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 123.21875, + "completions/mean_terminated_length": 123.21875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.4606767525118374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23828125, + "kl": 0.04014633361657616, + "learning_rate": 2.4e-08, + "loss": 0.0016, + "num_tokens": 92823665.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 257.4375, + "completions/mean_terminated_length": 257.4375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.46079223928860147, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.012430441071046516, + "learning_rate": 2.2e-08, + "loss": 0.0005, + "num_tokens": 92842943.0, + "reward": 3.8528456687927246, + "reward_std": 0.423977255821228, + "rewards/reward_fn/mean": 3.8528456687927246, + "rewards/reward_fn/std": 0.42397722601890564, + "step": 3990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 160.625, + "completions/mean_terminated_length": 160.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.4609077260653655, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.01038073325617006, + "learning_rate": 2e-08, + "loss": 0.0004, + "num_tokens": 92866227.0, + "reward": 3.7630927562713623, + "reward_std": 0.42922061681747437, + "rewards/reward_fn/mean": 3.7630927562713623, + "rewards/reward_fn/std": 0.42922061681747437, + "step": 3991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 286.6875, + "completions/mean_terminated_length": 286.6875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.4610232128421296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.016426668036729097, + "learning_rate": 1.8e-08, + "loss": 0.0007, + "num_tokens": 92894409.0, + "reward": 2.917135715484619, + "reward_std": 0.22629860043525696, + "rewards/reward_fn/mean": 2.917135715484619, + "rewards/reward_fn/std": 0.22629858553409576, + "step": 3992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 75.3125, + "completions/mean_terminated_length": 75.3125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.46113869961889364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.0060594719670916675, + "learning_rate": 1.6e-08, + "loss": 0.0002, + "num_tokens": 92920147.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 242.375, + "completions/mean_terminated_length": 242.375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.4612541863956577, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.008020058576221345, + "learning_rate": 1.4e-08, + "loss": 0.0003, + "num_tokens": 92943007.0, + "reward": 2.9559385776519775, + "reward_std": 0.40547409653663635, + "rewards/reward_fn/mean": 2.9559385776519775, + "rewards/reward_fn/std": 0.40547409653663635, + "step": 3994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 249.1875, + "completions/mean_terminated_length": 249.1875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.4613696731724218, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.008360024832654744, + "learning_rate": 1.2e-08, + "loss": 0.0003, + "num_tokens": 92969445.0, + "reward": 3.5712637901306152, + "reward_std": 0.5433269739151001, + "rewards/reward_fn/mean": 3.5712637901306152, + "rewards/reward_fn/std": 0.5433270335197449, + "step": 3995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 266.71875, + "completions/mean_terminated_length": 266.71875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.4614851599491858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.039224461164849345, + "learning_rate": 1e-08, + "loss": 0.0016, + "num_tokens": 92992892.0, + "reward": 3.9227213859558105, + "reward_std": 0.43715354800224304, + "rewards/reward_fn/mean": 3.9227213859558105, + "rewards/reward_fn/std": 0.43715354800224304, + "step": 3996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 127.21875, + "completions/mean_terminated_length": 127.21875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.46160064672594986, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.014328442310215905, + "learning_rate": 8e-09, + "loss": 0.0006, + "num_tokens": 93016451.0, + "reward": 3.669004440307617, + "reward_std": 0.4720156192779541, + "rewards/reward_fn/mean": 3.669004440307617, + "rewards/reward_fn/std": 0.4720155894756317, + "step": 3997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 225.0, + "completions/mean_terminated_length": 225.0, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.46171613350271395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.007786520276567899, + "learning_rate": 6e-09, + "loss": 0.0003, + "num_tokens": 93031715.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 236.09375, + "completions/mean_terminated_length": 236.09375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.461831620279478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.010726699765655212, + "learning_rate": 4e-09, + "loss": 0.0004, + "num_tokens": 93051238.0, + "reward": 4.0, + "reward_std": 0.0, + "rewards/reward_fn/mean": 4.0, + "rewards/reward_fn/std": 0.0, + "step": 3999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 85.53125, + "completions/mean_terminated_length": 85.53125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.46194710705624203, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.00555547166368342, + "learning_rate": 2e-09, + "loss": 0.0002, + "num_tokens": 93076759.0, + "reward": 2.892409324645996, + "reward_std": 0.04013507440686226, + "rewards/reward_fn/mean": 2.892409324645996, + "rewards/reward_fn/std": 0.04013512283563614, + "step": 4000 + } + ], + "logging_steps": 1, + "max_steps": 4000, + "num_input_tokens_seen": 93076759, + "num_train_epochs": 1, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}