{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.85546875, "eval_steps": 500, "global_step": 950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5980451107025146, "epoch": 0.001953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23979568481445312, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 4704.0, "reward": 0.6000000238418579, "reward_std": 0.5153923630714417, "rewards/reward_func/mean": 0.6000000238418579, "rewards/reward_func/std": 0.49902763962745667, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8363598696887493, "epoch": 0.00390625, "frac_reward_zero_std": 0.5, "grad_norm": 0.15716730058193207, "learning_rate": 4.9947368421052636e-06, "loss": -0.0, "num_tokens": 9408.0, "reward": 0.625, "reward_std": 0.25, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7976906187832355, "epoch": 0.005859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21684496104717255, "learning_rate": 4.989473684210527e-06, "loss": -0.0, "num_tokens": 13960.0, "reward": 0.8697500228881836, "reward_std": 0.2605000138282776, "rewards/reward_func/mean": 0.8697500228881836, "rewards/reward_func/std": 0.3517392575740814, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5398289896547794, "epoch": 0.0078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.16021442413330078, "learning_rate": 4.98421052631579e-06, "loss": -0.0, "num_tokens": 18624.0, "reward": 0.375, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5175492167472839, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4209697637706995, "epoch": 0.009765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.16803590953350067, "learning_rate": 4.978947368421053e-06, "loss": 0.0, "num_tokens": 23392.0, "reward": 0.3655624985694885, "reward_std": 0.5249491930007935, "rewards/reward_func/mean": 0.3655624985694885, "rewards/reward_func/std": 0.5012756586074829, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4214139562100172, "epoch": 0.01171875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1561802625656128, "learning_rate": 4.973684210526316e-06, "loss": 0.0, "num_tokens": 28168.0, "reward": 0.1210624948143959, "reward_std": 0.2379765659570694, "rewards/reward_func/mean": 0.1210624948143959, "rewards/reward_func/std": 0.33739402890205383, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6921200603246689, "epoch": 0.013671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.18477526307106018, "learning_rate": 4.968421052631579e-06, "loss": 0.0, "num_tokens": 33032.0, "reward": 0.375, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5175492167472839, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6270075961947441, "epoch": 0.015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19665871560573578, "learning_rate": 4.9631578947368426e-06, "loss": -0.0, "num_tokens": 37720.0, "reward": 0.375, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5175492167472839, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8109602108597755, "epoch": 0.017578125, "frac_reward_zero_std": 0.5, "grad_norm": 0.1696549654006958, "learning_rate": 4.957894736842106e-06, "loss": 0.0, "num_tokens": 42208.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6766198761761189, "epoch": 0.01953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22143109142780304, "learning_rate": 4.952631578947369e-06, "loss": 0.0, "num_tokens": 46688.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7187379151582718, "epoch": 0.021484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18868106603622437, "learning_rate": 4.947368421052632e-06, "loss": 0.0, "num_tokens": 51312.0, "reward": 0.48899999260902405, "reward_std": 0.48899999260902405, "rewards/reward_func/mean": 0.48899999260902405, "rewards/reward_func/std": 0.5227630138397217, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6034604460000992, "epoch": 0.0234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2152688056230545, "learning_rate": 4.942105263157895e-06, "loss": 0.0, "num_tokens": 55928.0, "reward": 0.6254249811172485, "reward_std": 0.5333645343780518, "rewards/reward_func/mean": 0.6254249811172485, "rewards/reward_func/std": 0.5111108422279358, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6899241618812084, "epoch": 0.025390625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1043475940823555, "learning_rate": 4.936842105263158e-06, "loss": 0.0, "num_tokens": 60568.0, "reward": 0.12229999899864197, "reward_std": 0.24459999799728394, "rewards/reward_func/mean": 0.12229999899864197, "rewards/reward_func/std": 0.34591662883758545, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5026568742468953, "epoch": 0.02734375, "frac_reward_zero_std": 0.5, "grad_norm": 0.13232570886611938, "learning_rate": 4.9315789473684215e-06, "loss": 0.0, "num_tokens": 65352.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/reward_func/mean": 0.0031250000465661287, "rewards/reward_func/std": 0.008838835172355175, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6425582990050316, "epoch": 0.029296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.23602722585201263, "learning_rate": 4.926315789473685e-06, "loss": -0.0, "num_tokens": 70200.0, "reward": 0.6197500228881836, "reward_std": 0.5353738069534302, "rewards/reward_func/mean": 0.6197500228881836, "rewards/reward_func/std": 0.5133981704711914, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8550453297793865, "epoch": 0.03125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21525943279266357, "learning_rate": 4.921052631578948e-06, "loss": 0.0, "num_tokens": 74696.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6007186323404312, "epoch": 0.033203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2051023691892624, "learning_rate": 4.915789473684211e-06, "loss": 0.0, "num_tokens": 79168.0, "reward": 0.7473000288009644, "reward_std": 0.2940751314163208, "rewards/reward_func/mean": 0.7473000288009644, "rewards/reward_func/std": 0.4613037705421448, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.46435519494116306, "epoch": 0.03515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20061683654785156, "learning_rate": 4.910526315789474e-06, "loss": -0.0, "num_tokens": 83520.0, "reward": 0.375, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5175492167472839, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6039901822805405, "epoch": 0.037109375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.905263157894737e-06, "loss": 0.0, "num_tokens": 88480.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.554688960313797, "epoch": 0.0390625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.9000000000000005e-06, "loss": 0.0, "num_tokens": 93464.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6815345846116543, "epoch": 0.041015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19471591711044312, "learning_rate": 4.894736842105264e-06, "loss": 0.0, "num_tokens": 98096.0, "reward": 0.7252500057220459, "reward_std": 0.48400574922561646, "rewards/reward_func/mean": 0.7252500057220459, "rewards/reward_func/std": 0.4482128620147705, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8857233077287674, "epoch": 0.04296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21319618821144104, "learning_rate": 4.889473684210527e-06, "loss": 0.0, "num_tokens": 102792.0, "reward": 0.7473000288009644, "reward_std": 0.2940751314163208, "rewards/reward_func/mean": 0.7473000288009644, "rewards/reward_func/std": 0.4613037705421448, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8026586882770061, "epoch": 0.044921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21624577045440674, "learning_rate": 4.88421052631579e-06, "loss": 0.0, "num_tokens": 107496.0, "reward": 0.375, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5175492167472839, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.591262873262167, "epoch": 0.046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.1991681307554245, "learning_rate": 4.878947368421053e-06, "loss": -0.0, "num_tokens": 111832.0, "reward": 0.3828125, "reward_std": 0.529944896697998, "rewards/reward_func/mean": 0.3828125, "rewards/reward_func/std": 0.5115163922309875, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5665613692253828, "epoch": 0.048828125, "frac_reward_zero_std": 0.5, "grad_norm": 0.14706794917583466, "learning_rate": 4.873684210526316e-06, "loss": -0.0, "num_tokens": 116824.0, "reward": 0.0015625000232830644, "reward_std": 0.0031250000465661287, "rewards/reward_func/mean": 0.0015625000232830644, "rewards/reward_func/std": 0.0044194175861775875, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7586575634777546, "epoch": 0.05078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23515093326568604, "learning_rate": 4.8684210526315795e-06, "loss": -0.0, "num_tokens": 121680.0, "reward": 0.25, "reward_std": 0.5, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.4629100561141968, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5237887464463711, "epoch": 0.052734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.209740549325943, "learning_rate": 4.863157894736843e-06, "loss": -0.0, "num_tokens": 126128.0, "reward": 0.375, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5175492167472839, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6274934262037277, "epoch": 0.0546875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.857894736842106e-06, "loss": 0.0, "num_tokens": 130584.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8661725111305714, "epoch": 0.056640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24378839135169983, "learning_rate": 4.852631578947369e-06, "loss": 0.0, "num_tokens": 135048.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6145374327898026, "epoch": 0.05859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21723540127277374, "learning_rate": 4.847368421052631e-06, "loss": -0.0, "num_tokens": 139880.0, "reward": 0.3762750029563904, "reward_std": 0.519599974155426, "rewards/reward_func/mean": 0.3762750029563904, "rewards/reward_func/std": 0.49889329075813293, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.37807827070355415, "epoch": 0.060546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.15673603117465973, "learning_rate": 4.842105263157895e-06, "loss": 0.0, "num_tokens": 144368.0, "reward": 0.34860000014305115, "reward_std": 0.5007524490356445, "rewards/reward_func/mean": 0.34860000014305115, "rewards/reward_func/std": 0.4811137020587921, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.44579252414405346, "epoch": 0.0625, "frac_reward_zero_std": 0.0, "grad_norm": 0.17633330821990967, "learning_rate": 4.8368421052631585e-06, "loss": 0.0, "num_tokens": 149016.0, "reward": 0.4918999969959259, "reward_std": 0.5681688785552979, "rewards/reward_func/mean": 0.4918999969959259, "rewards/reward_func/std": 0.5260374546051025, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.46997769735753536, "epoch": 0.064453125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.831578947368422e-06, "loss": 0.0, "num_tokens": 153800.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5178538374602795, "epoch": 0.06640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.18413496017456055, "learning_rate": 4.826315789473685e-06, "loss": 0.0, "num_tokens": 158472.0, "reward": 0.3675000071525574, "reward_std": 0.5236751437187195, "rewards/reward_func/mean": 0.3675000071525574, "rewards/reward_func/std": 0.5075360536575317, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49724796041846275, "epoch": 0.068359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.14860549569129944, "learning_rate": 4.821052631578948e-06, "loss": 0.0, "num_tokens": 163656.0, "reward": 0.25, "reward_std": 0.5, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.4629100561141968, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7501945123076439, "epoch": 0.0703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22438675165176392, "learning_rate": 4.815789473684211e-06, "loss": 0.0, "num_tokens": 168360.0, "reward": 0.7473000288009644, "reward_std": 0.2940751314163208, "rewards/reward_func/mean": 0.7473000288009644, "rewards/reward_func/std": 0.4613037705421448, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8743213079869747, "epoch": 0.072265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2222360074520111, "learning_rate": 4.8105263157894735e-06, "loss": 0.0, "num_tokens": 172832.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5445486754179001, "epoch": 0.07421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.20508722960948944, "learning_rate": 4.8052631578947375e-06, "loss": -0.0, "num_tokens": 177192.0, "reward": 0.25468748807907104, "reward_std": 0.49703317880630493, "rewards/reward_func/mean": 0.25468748807907104, "rewards/reward_func/std": 0.4601987898349762, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6154294572770596, "epoch": 0.076171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2429991364479065, "learning_rate": 4.800000000000001e-06, "loss": 0.0, "num_tokens": 181784.0, "reward": 0.6260000467300415, "reward_std": 0.5242137908935547, "rewards/reward_func/mean": 0.6259999871253967, "rewards/reward_func/std": 0.5048788785934448, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.469124692492187, "epoch": 0.078125, "frac_reward_zero_std": 0.5, "grad_norm": 0.09975184500217438, "learning_rate": 4.794736842105264e-06, "loss": 0.0, "num_tokens": 186136.0, "reward": 0.375, "reward_std": 0.25, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5175492167472839, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7508979439735413, "epoch": 0.080078125, "frac_reward_zero_std": 0.5, "grad_norm": 0.20132674276828766, "learning_rate": 4.789473684210527e-06, "loss": 0.0, "num_tokens": 190624.0, "reward": 0.8675000071525574, "reward_std": 0.2454078197479248, "rewards/reward_func/mean": 0.8675000071525574, "rewards/reward_func/std": 0.35115116834640503, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6970130242407322, "epoch": 0.08203125, "frac_reward_zero_std": 0.5, "grad_norm": 0.15729251503944397, "learning_rate": 4.78421052631579e-06, "loss": 0.0, "num_tokens": 195104.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7774740010499954, "epoch": 0.083984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2186206728219986, "learning_rate": 4.778947368421053e-06, "loss": 0.0, "num_tokens": 199936.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.43700238317251205, "epoch": 0.0859375, "frac_reward_zero_std": 0.5, "grad_norm": 0.094396211206913, "learning_rate": 4.773684210526316e-06, "loss": -0.0, "num_tokens": 204736.0, "reward": 0.11949999630451202, "reward_std": 0.23899999260902405, "rewards/reward_func/mean": 0.11949999630451202, "rewards/reward_func/std": 0.33799704909324646, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.32771568559110165, "epoch": 0.087890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.1980770379304886, "learning_rate": 4.76842105263158e-06, "loss": -0.0, "num_tokens": 209240.0, "reward": 0.3465000092983246, "reward_std": 0.49655240774154663, "rewards/reward_func/mean": 0.3465000092983246, "rewards/reward_func/std": 0.47824355959892273, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5444277804344893, "epoch": 0.08984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20421306788921356, "learning_rate": 4.763157894736842e-06, "loss": -0.0, "num_tokens": 214088.0, "reward": 0.25511249899864197, "reward_std": 0.29742252826690674, "rewards/reward_func/mean": 0.25511249899864197, "rewards/reward_func/std": 0.4534413814544678, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47741230949759483, "epoch": 0.091796875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1492561548948288, "learning_rate": 4.757894736842106e-06, "loss": 0.0, "num_tokens": 218680.0, "reward": 0.25, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.4629100561141968, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7896424978971481, "epoch": 0.09375, "frac_reward_zero_std": 0.0, "grad_norm": 0.23126615583896637, "learning_rate": 4.752631578947369e-06, "loss": 0.0, "num_tokens": 223168.0, "reward": 0.3675000071525574, "reward_std": 0.5302826166152954, "rewards/reward_func/mean": 0.3675000071525574, "rewards/reward_func/std": 0.5075360536575317, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6476297667250037, "epoch": 0.095703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.18310336768627167, "learning_rate": 4.747368421052632e-06, "loss": 0.0, "num_tokens": 228344.0, "reward": 0.49729999899864197, "reward_std": 0.5742666125297546, "rewards/reward_func/mean": 0.49729999899864197, "rewards/reward_func/std": 0.531683087348938, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5440277047455311, "epoch": 0.09765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19677050411701202, "learning_rate": 4.7421052631578954e-06, "loss": 0.0, "num_tokens": 232792.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.43108594603836536, "epoch": 0.099609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.1726936399936676, "learning_rate": 4.736842105263158e-06, "loss": 0.0, "num_tokens": 237968.0, "reward": 0.2593750059604645, "reward_std": 0.49439018964767456, "rewards/reward_func/mean": 0.2593750059604645, "rewards/reward_func/std": 0.45785558223724365, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8471032381057739, "epoch": 0.1015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.1951897144317627, "learning_rate": 4.731578947368422e-06, "loss": 0.0, "num_tokens": 242688.0, "reward": 0.3828125, "reward_std": 0.5339096784591675, "rewards/reward_func/mean": 0.3828125, "rewards/reward_func/std": 0.5115163922309875, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9184307493269444, "epoch": 0.103515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.27299290895462036, "learning_rate": 4.726315789473684e-06, "loss": 0.0, "num_tokens": 247168.0, "reward": 0.3812499940395355, "reward_std": 0.5347907543182373, "rewards/reward_func/mean": 0.3812499940395355, "rewards/reward_func/std": 0.5126523971557617, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6415830589830875, "epoch": 0.10546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22410304844379425, "learning_rate": 4.721052631578948e-06, "loss": 0.0, "num_tokens": 251656.0, "reward": 0.5, "reward_std": 0.5, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5345224738121033, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8139438554644585, "epoch": 0.107421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22878246009349823, "learning_rate": 4.71578947368421e-06, "loss": 0.0, "num_tokens": 256368.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7665090560913086, "epoch": 0.109375, "frac_reward_zero_std": 0.5, "grad_norm": 0.13540033996105194, "learning_rate": 4.710526315789474e-06, "loss": 0.0, "num_tokens": 260800.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7670854441821575, "epoch": 0.111328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20830559730529785, "learning_rate": 4.705263157894738e-06, "loss": 0.0, "num_tokens": 265296.0, "reward": 0.5015624761581421, "reward_std": 0.5755573511123657, "rewards/reward_func/mean": 0.5015624761581421, "rewards/reward_func/std": 0.5328678488731384, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4469104828312993, "epoch": 0.11328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20230792462825775, "learning_rate": 4.7e-06, "loss": 0.0, "num_tokens": 269616.0, "reward": 0.25468748807907104, "reward_std": 0.49703317880630493, "rewards/reward_func/mean": 0.25468748807907104, "rewards/reward_func/std": 0.4601987898349762, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.43488175235688686, "epoch": 0.115234375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1155906692147255, "learning_rate": 4.694736842105264e-06, "loss": 0.0, "num_tokens": 274400.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/reward_func/mean": 0.0031250000465661287, "rewards/reward_func/std": 0.008838835172355175, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.538303017616272, "epoch": 0.1171875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1079397052526474, "learning_rate": 4.689473684210526e-06, "loss": 0.0, "num_tokens": 279360.0, "reward": 0.11620000004768372, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.11620000004768372, "rewards/reward_func/std": 0.32866325974464417, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4871381111443043, "epoch": 0.119140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20236212015151978, "learning_rate": 4.68421052631579e-06, "loss": 0.0, "num_tokens": 284272.0, "reward": 0.0234375, "reward_std": 0.04291563481092453, "rewards/reward_func/mean": 0.0234375, "rewards/reward_func/std": 0.039774756878614426, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.779204398393631, "epoch": 0.12109375, "frac_reward_zero_std": 0.5, "grad_norm": 0.14050957560539246, "learning_rate": 4.6789473684210525e-06, "loss": 0.0, "num_tokens": 288760.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5929105505347252, "epoch": 0.123046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22070521116256714, "learning_rate": 4.6736842105263166e-06, "loss": 0.0, "num_tokens": 293440.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5897692143917084, "epoch": 0.125, "frac_reward_zero_std": 0.0, "grad_norm": 0.1809045523405075, "learning_rate": 4.668421052631579e-06, "loss": 0.0, "num_tokens": 298104.0, "reward": 0.6265624761581421, "reward_std": 0.5368822813034058, "rewards/reward_func/mean": 0.6265624761581421, "rewards/reward_func/std": 0.515407145023346, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5395209770649672, "epoch": 0.126953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21770712733268738, "learning_rate": 4.663157894736842e-06, "loss": 0.0, "num_tokens": 302952.0, "reward": 0.375, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5175492167472839, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5718973986804485, "epoch": 0.12890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19175319373607635, "learning_rate": 4.657894736842106e-06, "loss": 0.0, "num_tokens": 307576.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3761954288929701, "epoch": 0.130859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.1708212047815323, "learning_rate": 4.652631578947368e-06, "loss": -0.0, "num_tokens": 312424.0, "reward": 0.24729999899864197, "reward_std": 0.49459999799728394, "rewards/reward_func/mean": 0.24729999899864197, "rewards/reward_func/std": 0.4579469859600067, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4973205029964447, "epoch": 0.1328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.17592303454875946, "learning_rate": 4.647368421052632e-06, "loss": 0.0, "num_tokens": 317264.0, "reward": 0.24459999799728394, "reward_std": 0.48919999599456787, "rewards/reward_func/mean": 0.24459999799728394, "rewards/reward_func/std": 0.45291122794151306, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8153446540236473, "epoch": 0.134765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2120014876127243, "learning_rate": 4.642105263157895e-06, "loss": 0.0, "num_tokens": 321760.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.724060770124197, "epoch": 0.13671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22452718019485474, "learning_rate": 4.636842105263159e-06, "loss": -0.0, "num_tokens": 326440.0, "reward": 0.25, "reward_std": 0.5, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.4629100561141968, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5170768238604069, "epoch": 0.138671875, "frac_reward_zero_std": 0.5, "grad_norm": 0.12857870757579803, "learning_rate": 4.631578947368421e-06, "loss": 0.0, "num_tokens": 331632.0, "reward": 0.25, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.4629100561141968, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.688769031316042, "epoch": 0.140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.22918672859668732, "learning_rate": 4.626315789473684e-06, "loss": 0.0, "num_tokens": 336328.0, "reward": 0.614799976348877, "reward_std": 0.5321913957595825, "rewards/reward_func/mean": 0.614799976348877, "rewards/reward_func/std": 0.5094863176345825, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8502191081643105, "epoch": 0.142578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23828759789466858, "learning_rate": 4.621052631578948e-06, "loss": 0.0, "num_tokens": 340912.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8155621662735939, "epoch": 0.14453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20899727940559387, "learning_rate": 4.6157894736842105e-06, "loss": 0.0, "num_tokens": 345640.0, "reward": 0.5, "reward_std": 0.5, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5345224738121033, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5054981037974358, "epoch": 0.146484375, "frac_reward_zero_std": 0.5, "grad_norm": 0.13521376252174377, "learning_rate": 4.6105263157894745e-06, "loss": 0.0, "num_tokens": 350272.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7490544784814119, "epoch": 0.1484375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.605263157894737e-06, "loss": 0.0, "num_tokens": 354896.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6609128974378109, "epoch": 0.150390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2172318398952484, "learning_rate": 4.600000000000001e-06, "loss": 0.0, "num_tokens": 359656.0, "reward": 0.36480000615119934, "reward_std": 0.5248825550079346, "rewards/reward_func/mean": 0.36480000615119934, "rewards/reward_func/std": 0.5037338137626648, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.629855677485466, "epoch": 0.15234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2275475412607193, "learning_rate": 4.594736842105263e-06, "loss": -0.0, "num_tokens": 363984.0, "reward": 0.5062500238418579, "reward_std": 0.570318341255188, "rewards/reward_func/mean": 0.5062500238418579, "rewards/reward_func/std": 0.5280946493148804, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6377979442477226, "epoch": 0.154296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.18542706966400146, "learning_rate": 4.589473684210526e-06, "loss": 0.0, "num_tokens": 368672.0, "reward": 0.5066750049591064, "reward_std": 0.4890046715736389, "rewards/reward_func/mean": 0.5066750049591064, "rewards/reward_func/std": 0.5222390294075012, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6481011174619198, "epoch": 0.15625, "frac_reward_zero_std": 0.0, "grad_norm": 0.1995760053396225, "learning_rate": 4.5842105263157895e-06, "loss": 0.0, "num_tokens": 373080.0, "reward": 0.5015624761581421, "reward_std": 0.5755573511123657, "rewards/reward_func/mean": 0.5015624761581421, "rewards/reward_func/std": 0.5328678488731384, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5756206288933754, "epoch": 0.158203125, "frac_reward_zero_std": 0.5, "grad_norm": 0.18328534066677094, "learning_rate": 4.578947368421053e-06, "loss": 0.0, "num_tokens": 377520.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.564460570923984, "epoch": 0.16015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.16234658658504486, "learning_rate": 4.573684210526317e-06, "loss": 0.0, "num_tokens": 382704.0, "reward": 0.2447499930858612, "reward_std": 0.4894999861717224, "rewards/reward_func/mean": 0.2447499930858612, "rewards/reward_func/std": 0.4533279538154602, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4652852639555931, "epoch": 0.162109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18636715412139893, "learning_rate": 4.568421052631579e-06, "loss": -0.0, "num_tokens": 387048.0, "reward": 0.3812499940395355, "reward_std": 0.531643271446228, "rewards/reward_func/mean": 0.3812499940395355, "rewards/reward_func/std": 0.5126523971557617, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5575841926038265, "epoch": 0.1640625, "frac_reward_zero_std": 0.5, "grad_norm": 0.14538714289665222, "learning_rate": 4.563157894736843e-06, "loss": 0.0, "num_tokens": 391904.0, "reward": 0.25241249799728394, "reward_std": 0.27352577447891235, "rewards/reward_func/mean": 0.25241249799728394, "rewards/reward_func/std": 0.44840875267982483, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8532749712467194, "epoch": 0.166015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20851509273052216, "learning_rate": 4.557894736842105e-06, "loss": 0.0, "num_tokens": 396616.0, "reward": 0.48980000615119934, "reward_std": 0.48325222730636597, "rewards/reward_func/mean": 0.48980000615119934, "rewards/reward_func/std": 0.5239458084106445, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4446020573377609, "epoch": 0.16796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.1834844946861267, "learning_rate": 4.552631578947369e-06, "loss": 0.0, "num_tokens": 401784.0, "reward": 0.12443750351667404, "reward_std": 0.24887500703334808, "rewards/reward_func/mean": 0.12443750351667404, "rewards/reward_func/std": 0.3370656669139862, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8720123656094074, "epoch": 0.169921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19286687672138214, "learning_rate": 4.547368421052632e-06, "loss": 0.0, "num_tokens": 406488.0, "reward": 0.48500001430511475, "reward_std": 0.4904162883758545, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5189825296401978, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6445259749889374, "epoch": 0.171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.18657498061656952, "learning_rate": 4.542105263157895e-06, "loss": 0.0, "num_tokens": 411192.0, "reward": 0.8723000288009644, "reward_std": 0.25540000200271606, "rewards/reward_func/mean": 0.8723000288009644, "rewards/reward_func/std": 0.3525434732437134, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6623015515506268, "epoch": 0.173828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21270084381103516, "learning_rate": 4.536842105263158e-06, "loss": -0.0, "num_tokens": 415672.0, "reward": 0.375, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5175492167472839, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.623845137655735, "epoch": 0.17578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2535983622074127, "learning_rate": 4.531578947368421e-06, "loss": 0.0, "num_tokens": 419992.0, "reward": 0.390625, "reward_std": 0.5289278030395508, "rewards/reward_func/mean": 0.390625, "rewards/reward_func/std": 0.5054501891136169, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5036896169185638, "epoch": 0.177734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.16919977962970734, "learning_rate": 4.526315789473685e-06, "loss": 0.0, "num_tokens": 424840.0, "reward": 0.3747124969959259, "reward_std": 0.5183161497116089, "rewards/reward_func/mean": 0.3747124969959259, "rewards/reward_func/std": 0.5003470778465271, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5636924169957638, "epoch": 0.1796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22071091830730438, "learning_rate": 4.5210526315789475e-06, "loss": 0.0, "num_tokens": 429592.0, "reward": 0.3709375262260437, "reward_std": 0.5032609701156616, "rewards/reward_func/mean": 0.3709375262260437, "rewards/reward_func/std": 0.4890368580818176, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.609756950289011, "epoch": 0.181640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19224682450294495, "learning_rate": 4.5157894736842115e-06, "loss": 0.0, "num_tokens": 433952.0, "reward": 0.5015624761581421, "reward_std": 0.5755573511123657, "rewards/reward_func/mean": 0.5015624761581421, "rewards/reward_func/std": 0.5328678488731384, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.40662568248808384, "epoch": 0.18359375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.510526315789474e-06, "loss": 0.0, "num_tokens": 439112.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7100704610347748, "epoch": 0.185546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.23149752616882324, "learning_rate": 4.505263157894737e-06, "loss": 0.0, "num_tokens": 443864.0, "reward": 0.35249999165534973, "reward_std": 0.5063546299934387, "rewards/reward_func/mean": 0.35249999165534973, "rewards/reward_func/std": 0.4864962100982666, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8094519749283791, "epoch": 0.1875, "frac_reward_zero_std": 0.5, "grad_norm": 0.15361107885837555, "learning_rate": 4.5e-06, "loss": 0.0, "num_tokens": 448584.0, "reward": 0.8723000288009644, "reward_std": 0.24825221300125122, "rewards/reward_func/mean": 0.8723000288009644, "rewards/reward_func/std": 0.3525434732437134, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7366169281303883, "epoch": 0.189453125, "frac_reward_zero_std": 0.5, "grad_norm": 0.150665283203125, "learning_rate": 4.494736842105263e-06, "loss": 0.0, "num_tokens": 453272.0, "reward": 0.25, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.4629100561141968, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6502304151654243, "epoch": 0.19140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20246288180351257, "learning_rate": 4.489473684210527e-06, "loss": -0.0, "num_tokens": 458144.0, "reward": 0.7447500228881836, "reward_std": 0.4966987073421478, "rewards/reward_func/mean": 0.7447500228881836, "rewards/reward_func/std": 0.4598980247974396, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5997471921145916, "epoch": 0.193359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20762783288955688, "learning_rate": 4.48421052631579e-06, "loss": 0.0, "num_tokens": 463016.0, "reward": 0.6228749752044678, "reward_std": 0.5304585099220276, "rewards/reward_func/mean": 0.6228749752044678, "rewards/reward_func/std": 0.5091015100479126, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7601636275649071, "epoch": 0.1953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24827490746974945, "learning_rate": 4.478947368421054e-06, "loss": 0.0, "num_tokens": 467464.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5260781534016132, "epoch": 0.197265625, "frac_reward_zero_std": 0.5, "grad_norm": 0.15311233699321747, "learning_rate": 4.473684210526316e-06, "loss": -0.0, "num_tokens": 472096.0, "reward": 0.0015625000232830644, "reward_std": 0.0031250000465661287, "rewards/reward_func/mean": 0.0015625000232830644, "rewards/reward_func/std": 0.0044194175861775875, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7099028006196022, "epoch": 0.19921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.23158502578735352, "learning_rate": 4.468421052631579e-06, "loss": 0.0, "num_tokens": 476832.0, "reward": 0.5016124844551086, "reward_std": 0.5615566968917847, "rewards/reward_func/mean": 0.5016124844551086, "rewards/reward_func/std": 0.5200324058532715, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6568261608481407, "epoch": 0.201171875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1539800614118576, "learning_rate": 4.463157894736842e-06, "loss": 0.0, "num_tokens": 481592.0, "reward": 0.8300000429153442, "reward_std": 0.24041630327701569, "rewards/reward_func/mean": 0.8300000429153442, "rewards/reward_func/std": 0.33602720499038696, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.48168842773884535, "epoch": 0.203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.19704312086105347, "learning_rate": 4.4578947368421054e-06, "loss": -0.0, "num_tokens": 486440.0, "reward": 0.3668999969959259, "reward_std": 0.5270397663116455, "rewards/reward_func/mean": 0.3668999969959259, "rewards/reward_func/std": 0.5063701272010803, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4500679522752762, "epoch": 0.205078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.15499116480350494, "learning_rate": 4.452631578947369e-06, "loss": -0.0, "num_tokens": 491224.0, "reward": 0.12262499332427979, "reward_std": 0.24110156297683716, "rewards/reward_func/mean": 0.12262499332427979, "rewards/reward_func/std": 0.33678168058395386, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6883682459592819, "epoch": 0.20703125, "frac_reward_zero_std": 0.5, "grad_norm": 0.12739039957523346, "learning_rate": 4.447368421052632e-06, "loss": 0.0, "num_tokens": 495864.0, "reward": 0.25, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.4629100561141968, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5138243455439806, "epoch": 0.208984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20175091922283173, "learning_rate": 4.442105263157896e-06, "loss": -0.0, "num_tokens": 500720.0, "reward": 0.4923250079154968, "reward_std": 0.5613177418708801, "rewards/reward_func/mean": 0.49232497811317444, "rewards/reward_func/std": 0.5197004675865173, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7696716003119946, "epoch": 0.2109375, "frac_reward_zero_std": 0.5, "grad_norm": 0.19667677581310272, "learning_rate": 4.436842105263158e-06, "loss": 0.0, "num_tokens": 505320.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6834167242050171, "epoch": 0.212890625, "frac_reward_zero_std": 0.5, "grad_norm": 0.19136440753936768, "learning_rate": 4.431578947368421e-06, "loss": 0.0, "num_tokens": 509792.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5321553461253643, "epoch": 0.21484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2594543993473053, "learning_rate": 4.426315789473684e-06, "loss": -0.0, "num_tokens": 514392.0, "reward": 0.629687488079071, "reward_std": 0.5333658456802368, "rewards/reward_func/mean": 0.629687488079071, "rewards/reward_func/std": 0.5112107992172241, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6744099333882332, "epoch": 0.216796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.20642893016338348, "learning_rate": 4.4210526315789476e-06, "loss": -0.0, "num_tokens": 519056.0, "reward": 0.375, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5175492167472839, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6416698023676872, "epoch": 0.21875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22189435362815857, "learning_rate": 4.415789473684211e-06, "loss": 0.0, "num_tokens": 524064.0, "reward": 0.23100000619888306, "reward_std": 0.4619999825954437, "rewards/reward_func/mean": 0.23100000619888306, "rewards/reward_func/std": 0.42808806896209717, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6506899148225784, "epoch": 0.220703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24659857153892517, "learning_rate": 4.410526315789474e-06, "loss": 0.0, "num_tokens": 529056.0, "reward": 0.3370000123977661, "reward_std": 0.483335018157959, "rewards/reward_func/mean": 0.3370000123977661, "rewards/reward_func/std": 0.46515026688575745, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6117045246064663, "epoch": 0.22265625, "frac_reward_zero_std": 0.5, "grad_norm": 0.15469233691692352, "learning_rate": 4.405263157894737e-06, "loss": 0.0, "num_tokens": 533664.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.681044191122055, "epoch": 0.224609375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1914876252412796, "learning_rate": 4.4e-06, "loss": 0.0, "num_tokens": 538104.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6017662808299065, "epoch": 0.2265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2209053784608841, "learning_rate": 4.394736842105263e-06, "loss": 0.0, "num_tokens": 543024.0, "reward": 0.4899500012397766, "reward_std": 0.5516552925109863, "rewards/reward_func/mean": 0.4899500012397766, "rewards/reward_func/std": 0.5108809471130371, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6683652102947235, "epoch": 0.228515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19945447146892548, "learning_rate": 4.3894736842105266e-06, "loss": -0.0, "num_tokens": 547656.0, "reward": 0.3683124780654907, "reward_std": 0.5250316858291626, "rewards/reward_func/mean": 0.3683124780654907, "rewards/reward_func/std": 0.5048869252204895, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3754726406186819, "epoch": 0.23046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19966642558574677, "learning_rate": 4.38421052631579e-06, "loss": 0.0, "num_tokens": 552160.0, "reward": 0.47360000014305115, "reward_std": 0.47124379873275757, "rewards/reward_func/mean": 0.47360000014305115, "rewards/reward_func/std": 0.5068238377571106, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4965004436671734, "epoch": 0.232421875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1419457346200943, "learning_rate": 4.378947368421053e-06, "loss": 0.0, "num_tokens": 557144.0, "reward": 0.11100000143051147, "reward_std": 0.22200000286102295, "rewards/reward_func/mean": 0.11100000143051147, "rewards/reward_func/std": 0.3139553964138031, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.509646050632, "epoch": 0.234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18533200025558472, "learning_rate": 4.373684210526316e-06, "loss": -0.0, "num_tokens": 561792.0, "reward": 0.37892499566078186, "reward_std": 0.5219879150390625, "rewards/reward_func/mean": 0.37892499566078186, "rewards/reward_func/std": 0.5029488205909729, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6187039539217949, "epoch": 0.236328125, "frac_reward_zero_std": 0.5, "grad_norm": 0.14618447422981262, "learning_rate": 4.368421052631579e-06, "loss": 0.0, "num_tokens": 566528.0, "reward": 0.9474999904632568, "reward_std": 0.01500000525265932, "rewards/reward_func/mean": 0.9474999904632568, "rewards/reward_func/std": 0.02121320553123951, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4034488648176193, "epoch": 0.23828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.17633076012134552, "learning_rate": 4.363157894736842e-06, "loss": 0.0, "num_tokens": 571704.0, "reward": 0.7593749761581421, "reward_std": 0.48124998807907104, "rewards/reward_func/mean": 0.7593749761581421, "rewards/reward_func/std": 0.4460016191005707, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6987418737262487, "epoch": 0.240234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.26086556911468506, "learning_rate": 4.3578947368421055e-06, "loss": -0.0, "num_tokens": 576560.0, "reward": 0.6266999840736389, "reward_std": 0.5093674659729004, "rewards/reward_func/mean": 0.6266999840736389, "rewards/reward_func/std": 0.4916507303714752, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7009507194161415, "epoch": 0.2421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2221718728542328, "learning_rate": 4.352631578947369e-06, "loss": -0.0, "num_tokens": 581488.0, "reward": 0.49502497911453247, "reward_std": 0.5644694566726685, "rewards/reward_func/mean": 0.49502497911453247, "rewards/reward_func/std": 0.5226343870162964, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3089421261101961, "epoch": 0.244140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.16174942255020142, "learning_rate": 4.347368421052632e-06, "loss": 0.0, "num_tokens": 585992.0, "reward": 0.6930000185966492, "reward_std": 0.4620679020881653, "rewards/reward_func/mean": 0.6930000185966492, "rewards/reward_func/std": 0.4277917146682739, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6322880983352661, "epoch": 0.24609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.19183886051177979, "learning_rate": 4.342105263157895e-06, "loss": -0.0, "num_tokens": 590960.0, "reward": 0.2251249998807907, "reward_std": 0.25995194911956787, "rewards/reward_func/mean": 0.2251250147819519, "rewards/reward_func/std": 0.4091717302799225, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8441327884793282, "epoch": 0.248046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22232002019882202, "learning_rate": 4.336842105263158e-06, "loss": -0.0, "num_tokens": 595496.0, "reward": 0.37229999899864197, "reward_std": 0.5355914831161499, "rewards/reward_func/mean": 0.37229999899864197, "rewards/reward_func/std": 0.5138660669326782, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3662140220403671, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.19537632167339325, "learning_rate": 4.331578947368421e-06, "loss": 0.0, "num_tokens": 599992.0, "reward": 0.4710499942302704, "reward_std": 0.46093741059303284, "rewards/reward_func/mean": 0.4710499942302704, "rewards/reward_func/std": 0.49048370122909546, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6109810192137957, "epoch": 0.251953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20167317986488342, "learning_rate": 4.3263157894736845e-06, "loss": 0.0, "num_tokens": 604848.0, "reward": 0.25439998507499695, "reward_std": 0.47605061531066895, "rewards/reward_func/mean": 0.25439998507499695, "rewards/reward_func/std": 0.44077494740486145, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.765848021954298, "epoch": 0.25390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2271883189678192, "learning_rate": 4.321052631578948e-06, "loss": 0.0, "num_tokens": 609472.0, "reward": 0.49524998664855957, "reward_std": 0.48512208461761475, "rewards/reward_func/mean": 0.49524998664855957, "rewards/reward_func/std": 0.5163409113883972, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6323870681226254, "epoch": 0.255859375, "frac_reward_zero_std": 0.0, "grad_norm": 2.204331636428833, "learning_rate": 4.315789473684211e-06, "loss": 0.0, "num_tokens": 613824.0, "reward": 0.7515624761581421, "reward_std": 0.49687498807907104, "rewards/reward_func/mean": 0.7515624761581421, "rewards/reward_func/std": 0.4600290060043335, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6582241244614124, "epoch": 0.2578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23651830852031708, "learning_rate": 4.310526315789474e-06, "loss": -0.0, "num_tokens": 618512.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6236909478902817, "epoch": 0.259765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.1894795149564743, "learning_rate": 4.305263157894737e-06, "loss": 0.0, "num_tokens": 623488.0, "reward": 0.2150000035762787, "reward_std": 0.4300000071525574, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.39838388562202454, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.23316527949646115, "epoch": 0.26171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.14522254467010498, "learning_rate": 4.3e-06, "loss": 0.0, "num_tokens": 627992.0, "reward": 0.3499000072479248, "reward_std": 0.5033524036407471, "rewards/reward_func/mean": 0.3499000072479248, "rewards/reward_func/std": 0.48291856050491333, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5633776243776083, "epoch": 0.263671875, "frac_reward_zero_std": 0.5, "grad_norm": 0.14741557836532593, "learning_rate": 4.2947368421052635e-06, "loss": 0.0, "num_tokens": 632352.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5170593503862619, "epoch": 0.265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20114511251449585, "learning_rate": 4.289473684210527e-06, "loss": 0.0, "num_tokens": 637200.0, "reward": 0.2691749930381775, "reward_std": 0.28962743282318115, "rewards/reward_func/mean": 0.2691749930381775, "rewards/reward_func/std": 0.44533297419548035, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.35685587860643864, "epoch": 0.267578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.17862118780612946, "learning_rate": 4.28421052631579e-06, "loss": -0.0, "num_tokens": 641688.0, "reward": 0.6972000002861023, "reward_std": 0.46480000019073486, "rewards/reward_func/mean": 0.6972000002861023, "rewards/reward_func/std": 0.43032118678092957, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8764316439628601, "epoch": 0.26953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22691944241523743, "learning_rate": 4.278947368421053e-06, "loss": 0.0, "num_tokens": 646184.0, "reward": 0.49729999899864197, "reward_std": 0.5742666125297546, "rewards/reward_func/mean": 0.49729999899864197, "rewards/reward_func/std": 0.531683087348938, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.703185360878706, "epoch": 0.271484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21261753141880035, "learning_rate": 4.273684210526316e-06, "loss": 0.0, "num_tokens": 651160.0, "reward": 0.4440000057220459, "reward_std": 0.5126870274543762, "rewards/reward_func/mean": 0.4440000057220459, "rewards/reward_func/std": 0.47465598583221436, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7478283271193504, "epoch": 0.2734375, "frac_reward_zero_std": 0.5, "grad_norm": 0.15642641484737396, "learning_rate": 4.268421052631579e-06, "loss": 0.0, "num_tokens": 656016.0, "reward": 0.11140000075101852, "reward_std": 0.22280000150203705, "rewards/reward_func/mean": 0.11140000075101852, "rewards/reward_func/std": 0.3150867819786072, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.507564508356154, "epoch": 0.275390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.15584400296211243, "learning_rate": 4.2631578947368425e-06, "loss": 0.0, "num_tokens": 660816.0, "reward": 0.24137499928474426, "reward_std": 0.45809125900268555, "rewards/reward_func/mean": 0.24137499928474426, "rewards/reward_func/std": 0.42415857315063477, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6022160463035107, "epoch": 0.27734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20966751873493195, "learning_rate": 4.257894736842106e-06, "loss": -0.0, "num_tokens": 665656.0, "reward": 0.7447500228881836, "reward_std": 0.2991751432418823, "rewards/reward_func/mean": 0.7447500228881836, "rewards/reward_func/std": 0.4598980247974396, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9077504724264145, "epoch": 0.279296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.27349957823753357, "learning_rate": 4.252631578947369e-06, "loss": 0.0, "num_tokens": 670144.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5460263751447201, "epoch": 0.28125, "frac_reward_zero_std": 0.0, "grad_norm": 0.17890246212482452, "learning_rate": 4.247368421052632e-06, "loss": 0.0, "num_tokens": 674832.0, "reward": 0.503125011920929, "reward_std": 0.49798667430877686, "rewards/reward_func/mean": 0.503125011920929, "rewards/reward_func/std": 0.5312447547912598, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6752507574856281, "epoch": 0.283203125, "frac_reward_zero_std": 0.5, "grad_norm": 0.17742383480072021, "learning_rate": 4.242105263157895e-06, "loss": 0.0, "num_tokens": 679176.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6691841837018728, "epoch": 0.28515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2150551676750183, "learning_rate": 4.236842105263158e-06, "loss": -0.0, "num_tokens": 683968.0, "reward": 0.7343499660491943, "reward_std": 0.49009713530540466, "rewards/reward_func/mean": 0.7343499660491943, "rewards/reward_func/std": 0.45377910137176514, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.970278836786747, "epoch": 0.287109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.27998873591423035, "learning_rate": 4.2315789473684215e-06, "loss": -0.0, "num_tokens": 688464.0, "reward": 0.37229999899864197, "reward_std": 0.5355914831161499, "rewards/reward_func/mean": 0.37229999899864197, "rewards/reward_func/std": 0.5138660669326782, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.38460469990968704, "epoch": 0.2890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24717585742473602, "learning_rate": 4.226315789473685e-06, "loss": 0.0, "num_tokens": 693256.0, "reward": 0.36281248927116394, "reward_std": 0.5174503326416016, "rewards/reward_func/mean": 0.36281248927116394, "rewards/reward_func/std": 0.4976207911968231, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8782265558838844, "epoch": 0.291015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23354728519916534, "learning_rate": 4.221052631578948e-06, "loss": 0.0, "num_tokens": 697904.0, "reward": 0.3670499920845032, "reward_std": 0.5250914692878723, "rewards/reward_func/mean": 0.3670499920845032, "rewards/reward_func/std": 0.5067015290260315, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6834246851503849, "epoch": 0.29296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2545972764492035, "learning_rate": 4.215789473684211e-06, "loss": -0.0, "num_tokens": 702248.0, "reward": 0.7447500228881836, "reward_std": 0.2991751432418823, "rewards/reward_func/mean": 0.7447500228881836, "rewards/reward_func/std": 0.4598980247974396, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6247103922069073, "epoch": 0.294921875, "frac_reward_zero_std": 0.5, "grad_norm": 0.14536061882972717, "learning_rate": 4.210526315789474e-06, "loss": 0.0, "num_tokens": 707120.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5234713517129421, "epoch": 0.296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2018449902534485, "learning_rate": 4.205263157894737e-06, "loss": 0.0, "num_tokens": 711768.0, "reward": 0.6231000423431396, "reward_std": 0.5223661661148071, "rewards/reward_func/mean": 0.6230999827384949, "rewards/reward_func/std": 0.5024495124816895, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4138908423483372, "epoch": 0.298828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.1805502027273178, "learning_rate": 4.2000000000000004e-06, "loss": -0.0, "num_tokens": 716568.0, "reward": 0.3546624779701233, "reward_std": 0.5078586935997009, "rewards/reward_func/mean": 0.3546624779701233, "rewards/reward_func/std": 0.48623159527778625, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5758881866931915, "epoch": 0.30078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23512139916419983, "learning_rate": 4.194736842105264e-06, "loss": 0.0, "num_tokens": 721352.0, "reward": 0.7323000431060791, "reward_std": 0.4886685013771057, "rewards/reward_func/mean": 0.7323000431060791, "rewards/reward_func/std": 0.45266833901405334, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47453245986253023, "epoch": 0.302734375, "frac_reward_zero_std": 0.5, "grad_norm": 0.15655399858951569, "learning_rate": 4.189473684210527e-06, "loss": 0.0, "num_tokens": 725864.0, "reward": 0.6951000094413757, "reward_std": 0.26594966650009155, "rewards/reward_func/mean": 0.6951000094413757, "rewards/reward_func/std": 0.42906418442726135, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7080131396651268, "epoch": 0.3046875, "frac_reward_zero_std": 0.5, "grad_norm": 0.20565909147262573, "learning_rate": 4.18421052631579e-06, "loss": 0.0, "num_tokens": 730704.0, "reward": 0.23100000619888306, "reward_std": 0.2670717239379883, "rewards/reward_func/mean": 0.23100000619888306, "rewards/reward_func/std": 0.42808806896209717, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 1.0115558728575706, "epoch": 0.306640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.7812044024467468, "learning_rate": 4.178947368421053e-06, "loss": 0.0, "num_tokens": 735200.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5327462665736675, "epoch": 0.30859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18726424872875214, "learning_rate": 4.173684210526316e-06, "loss": -0.0, "num_tokens": 739856.0, "reward": 0.3703624904155731, "reward_std": 0.5275677442550659, "rewards/reward_func/mean": 0.3703624904155731, "rewards/reward_func/std": 0.5077766180038452, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7538893632590771, "epoch": 0.310546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2491677850484848, "learning_rate": 4.1684210526315794e-06, "loss": 0.0, "num_tokens": 744640.0, "reward": 0.24524998664855957, "reward_std": 0.2810765504837036, "rewards/reward_func/mean": 0.24525000154972076, "rewards/reward_func/std": 0.43876922130584717, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.440243948251009, "epoch": 0.3125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2969156801700592, "learning_rate": 4.163157894736843e-06, "loss": 0.0, "num_tokens": 749424.0, "reward": 0.00937500037252903, "reward_std": 0.01875000074505806, "rewards/reward_func/mean": 0.00937500037252903, "rewards/reward_func/std": 0.018600596114993095, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5540930740535259, "epoch": 0.314453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2284710556268692, "learning_rate": 4.157894736842106e-06, "loss": 0.0, "num_tokens": 754280.0, "reward": 0.4969000220298767, "reward_std": 0.47432881593704224, "rewards/reward_func/mean": 0.4968999922275543, "rewards/reward_func/std": 0.5051693916320801, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5730804577469826, "epoch": 0.31640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21612071990966797, "learning_rate": 4.152631578947369e-06, "loss": -0.0, "num_tokens": 759216.0, "reward": 0.1597999930381775, "reward_std": 0.251237154006958, "rewards/reward_func/mean": 0.1597999930381775, "rewards/reward_func/std": 0.3319000005722046, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9729300402104855, "epoch": 0.318359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.26729077100753784, "learning_rate": 4.147368421052632e-06, "loss": -0.0, "num_tokens": 763944.0, "reward": 0.7394999861717224, "reward_std": 0.30079948902130127, "rewards/reward_func/mean": 0.7394999861717224, "rewards/reward_func/std": 0.4567972421646118, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5482957363128662, "epoch": 0.3203125, "frac_reward_zero_std": 0.5, "grad_norm": 0.19377972185611725, "learning_rate": 4.142105263157895e-06, "loss": 0.0, "num_tokens": 768392.0, "reward": 0.8723000288009644, "reward_std": 0.24825221300125122, "rewards/reward_func/mean": 0.8723000288009644, "rewards/reward_func/std": 0.3525434732437134, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6647583916783333, "epoch": 0.322265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21792468428611755, "learning_rate": 4.136842105263158e-06, "loss": -0.0, "num_tokens": 773112.0, "reward": 0.6197500228881836, "reward_std": 0.2605000138282776, "rewards/reward_func/mean": 0.6197500228881836, "rewards/reward_func/std": 0.5133981704711914, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5656867567449808, "epoch": 0.32421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2745096683502197, "learning_rate": 4.1315789473684216e-06, "loss": 0.0, "num_tokens": 778024.0, "reward": 0.3997125029563904, "reward_std": 0.4995589852333069, "rewards/reward_func/mean": 0.3997125029563904, "rewards/reward_func/std": 0.47963643074035645, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5817390941083431, "epoch": 0.326171875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1972116380929947, "learning_rate": 4.126315789473685e-06, "loss": -0.0, "num_tokens": 782976.0, "reward": 0.0015625000232830644, "reward_std": 0.0031250000465661287, "rewards/reward_func/mean": 0.0015625000232830644, "rewards/reward_func/std": 0.0044194175861775875, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6987337954342365, "epoch": 0.328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20925994217395782, "learning_rate": 4.121052631578948e-06, "loss": -0.0, "num_tokens": 787984.0, "reward": 0.35600000619888306, "reward_std": 0.5124994516372681, "rewards/reward_func/mean": 0.35600000619888306, "rewards/reward_func/std": 0.4916515350341797, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9033612161874771, "epoch": 0.330078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2295723259449005, "learning_rate": 4.115789473684211e-06, "loss": 0.0, "num_tokens": 792344.0, "reward": 0.6353750228881836, "reward_std": 0.5139543414115906, "rewards/reward_func/mean": 0.6353750228881836, "rewards/reward_func/std": 0.4924015998840332, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7016931138932705, "epoch": 0.33203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20966175198554993, "learning_rate": 4.110526315789474e-06, "loss": 0.0, "num_tokens": 797032.0, "reward": 0.5, "reward_std": 0.5, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5345224738121033, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6281918026506901, "epoch": 0.333984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20377099514007568, "learning_rate": 4.105263157894737e-06, "loss": 0.0, "num_tokens": 801688.0, "reward": 0.7515624761581421, "reward_std": 0.49687498807907104, "rewards/reward_func/mean": 0.7515624761581421, "rewards/reward_func/std": 0.4600290060043335, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.757878951728344, "epoch": 0.3359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.25204285979270935, "learning_rate": 4.1e-06, "loss": -0.0, "num_tokens": 806624.0, "reward": 0.24729999899864197, "reward_std": 0.49459999799728394, "rewards/reward_func/mean": 0.24729999899864197, "rewards/reward_func/std": 0.4579470157623291, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5134147731587291, "epoch": 0.337890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.168964684009552, "learning_rate": 4.094736842105264e-06, "loss": -0.0, "num_tokens": 811104.0, "reward": 0.5767999887466431, "reward_std": 0.4959026575088501, "rewards/reward_func/mean": 0.5767999887466431, "rewards/reward_func/std": 0.47768643498420715, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6563267186284065, "epoch": 0.33984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.23291705548763275, "learning_rate": 4.089473684210527e-06, "loss": 0.0, "num_tokens": 815760.0, "reward": 0.49806252121925354, "reward_std": 0.49673035740852356, "rewards/reward_func/mean": 0.49806249141693115, "rewards/reward_func/std": 0.5292056798934937, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6437232866883278, "epoch": 0.341796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.24242784082889557, "learning_rate": 4.08421052631579e-06, "loss": 0.0, "num_tokens": 820600.0, "reward": 0.45920002460479736, "reward_std": 0.53072190284729, "rewards/reward_func/mean": 0.45920002460479736, "rewards/reward_func/std": 0.49137234687805176, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.44995045475661755, "epoch": 0.34375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20669235289096832, "learning_rate": 4.078947368421053e-06, "loss": 0.0, "num_tokens": 825384.0, "reward": 0.25687500834465027, "reward_std": 0.2894454598426819, "rewards/reward_func/mean": 0.2568749785423279, "rewards/reward_func/std": 0.4227794110774994, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6303725428879261, "epoch": 0.345703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.17360170185565948, "learning_rate": 4.073684210526316e-06, "loss": -0.0, "num_tokens": 830176.0, "reward": 0.24449999630451202, "reward_std": 0.48899999260902405, "rewards/reward_func/mean": 0.24449999630451202, "rewards/reward_func/std": 0.4528787434101105, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5138363922014832, "epoch": 0.34765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23947031795978546, "learning_rate": 4.0684210526315795e-06, "loss": -0.0, "num_tokens": 834968.0, "reward": 0.476812481880188, "reward_std": 0.4751999080181122, "rewards/reward_func/mean": 0.476812481880188, "rewards/reward_func/std": 0.506460964679718, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8445651698857546, "epoch": 0.349609375, "frac_reward_zero_std": 0.5, "grad_norm": 0.2013005018234253, "learning_rate": 4.063157894736842e-06, "loss": -0.0, "num_tokens": 839816.0, "reward": 0.625, "reward_std": 0.25, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6270747724920511, "epoch": 0.3515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2170683890581131, "learning_rate": 4.057894736842106e-06, "loss": -0.0, "num_tokens": 844424.0, "reward": 0.7447500228881836, "reward_std": 0.2991751432418823, "rewards/reward_func/mean": 0.7447500228881836, "rewards/reward_func/std": 0.4598980247974396, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7638631723821163, "epoch": 0.353515625, "frac_reward_zero_std": 0.5, "grad_norm": 0.2188190519809723, "learning_rate": 4.052631578947368e-06, "loss": 0.0, "num_tokens": 848736.0, "reward": 0.8765624761581421, "reward_std": 0.24687500298023224, "rewards/reward_func/mean": 0.8765624761581421, "rewards/reward_func/std": 0.34913399815559387, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6433968283236027, "epoch": 0.35546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21660254895687103, "learning_rate": 4.047368421052632e-06, "loss": 0.0, "num_tokens": 853696.0, "reward": 0.2412624955177307, "reward_std": 0.4451237916946411, "rewards/reward_func/mean": 0.2412625104188919, "rewards/reward_func/std": 0.4122578501701355, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7037541754543781, "epoch": 0.357421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.24847397208213806, "learning_rate": 4.042105263157895e-06, "loss": 0.0, "num_tokens": 858184.0, "reward": 0.5062500238418579, "reward_std": 0.4917367100715637, "rewards/reward_func/mean": 0.5062500238418579, "rewards/reward_func/std": 0.5279255509376526, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6735056638717651, "epoch": 0.359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24711540341377258, "learning_rate": 4.0368421052631585e-06, "loss": 0.0, "num_tokens": 862816.0, "reward": 0.49459999799728394, "reward_std": 0.5711829662322998, "rewards/reward_func/mean": 0.49459999799728394, "rewards/reward_func/std": 0.5288126468658447, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.38281242828816175, "epoch": 0.361328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20001716911792755, "learning_rate": 4.031578947368422e-06, "loss": -0.0, "num_tokens": 867320.0, "reward": 0.5843999981880188, "reward_std": 0.504734992980957, "rewards/reward_func/mean": 0.5843999981880188, "rewards/reward_func/std": 0.4840165376663208, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6081971619278193, "epoch": 0.36328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21603205800056458, "learning_rate": 4.026315789473684e-06, "loss": 0.0, "num_tokens": 872280.0, "reward": 0.2266875058412552, "reward_std": 0.2657185196876526, "rewards/reward_func/mean": 0.2266875058412552, "rewards/reward_func/std": 0.40837597846984863, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6896807588636875, "epoch": 0.365234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.23351432383060455, "learning_rate": 4.021052631578948e-06, "loss": 0.0, "num_tokens": 876816.0, "reward": 0.38750001788139343, "reward_std": 0.2557178735733032, "rewards/reward_func/mean": 0.38749998807907104, "rewards/reward_func/std": 0.5074445605278015, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6723958812654018, "epoch": 0.3671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2449483573436737, "learning_rate": 4.01578947368421e-06, "loss": 0.0, "num_tokens": 881448.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5192831270396709, "epoch": 0.369140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23788705468177795, "learning_rate": 4.010526315789474e-06, "loss": 0.0, "num_tokens": 886288.0, "reward": 0.2617875039577484, "reward_std": 0.4782521724700928, "rewards/reward_func/mean": 0.2617875039577484, "rewards/reward_func/std": 0.4428786039352417, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5673484560102224, "epoch": 0.37109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2121075987815857, "learning_rate": 4.005263157894737e-06, "loss": -0.0, "num_tokens": 891080.0, "reward": 0.23899999260902405, "reward_std": 0.4779999852180481, "rewards/reward_func/mean": 0.23899999260902405, "rewards/reward_func/std": 0.44254201650619507, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.40696537867188454, "epoch": 0.373046875, "frac_reward_zero_std": 0.5, "grad_norm": 0.13059356808662415, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "num_tokens": 895568.0, "reward": 0.6972000002861023, "reward_std": 0.2683524191379547, "rewards/reward_func/mean": 0.6972000002861023, "rewards/reward_func/std": 0.43032118678092957, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7386405281722546, "epoch": 0.375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24719874560832977, "learning_rate": 3.994736842105264e-06, "loss": 0.0, "num_tokens": 900312.0, "reward": 0.48875001072883606, "reward_std": 0.4433884024620056, "rewards/reward_func/mean": 0.48874998092651367, "rewards/reward_func/std": 0.4832848906517029, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5994266867637634, "epoch": 0.376953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22721026837825775, "learning_rate": 3.989473684210526e-06, "loss": 0.0, "num_tokens": 904664.0, "reward": 0.6390625238418579, "reward_std": 0.517444908618927, "rewards/reward_func/mean": 0.6390625238418579, "rewards/reward_func/std": 0.49845463037490845, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6586835943162441, "epoch": 0.37890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21596045792102814, "learning_rate": 3.98421052631579e-06, "loss": 0.0, "num_tokens": 909648.0, "reward": 0.11412499845027924, "reward_std": 0.2256084382534027, "rewards/reward_func/mean": 0.11412499845027924, "rewards/reward_func/std": 0.3127436935901642, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7914648056030273, "epoch": 0.380859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.23387473821640015, "learning_rate": 3.9789473684210525e-06, "loss": -0.0, "num_tokens": 914416.0, "reward": 0.6203374862670898, "reward_std": 0.5114390850067139, "rewards/reward_func/mean": 0.6203374862670898, "rewards/reward_func/std": 0.49055466055870056, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5869229193776846, "epoch": 0.3828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23709914088249207, "learning_rate": 3.9736842105263165e-06, "loss": 0.0, "num_tokens": 919352.0, "reward": 0.5044000148773193, "reward_std": 0.4865538477897644, "rewards/reward_func/mean": 0.5044000148773193, "rewards/reward_func/std": 0.5128971934318542, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.579900648444891, "epoch": 0.384765625, "frac_reward_zero_std": 0.5, "grad_norm": 0.15101991593837738, "learning_rate": 3.968421052631579e-06, "loss": 0.0, "num_tokens": 923976.0, "reward": 0.9973000288009644, "reward_std": 0.0054000020027160645, "rewards/reward_func/mean": 0.9973000288009644, "rewards/reward_func/std": 0.007636756170541048, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8158314451575279, "epoch": 0.38671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21359558403491974, "learning_rate": 3.963157894736843e-06, "loss": 0.0, "num_tokens": 928512.0, "reward": 0.12968750298023224, "reward_std": 0.2552257776260376, "rewards/reward_func/mean": 0.12968750298023224, "rewards/reward_func/std": 0.35177722573280334, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9157693684101105, "epoch": 0.388671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.254290372133255, "learning_rate": 3.957894736842106e-06, "loss": 0.0, "num_tokens": 933008.0, "reward": 0.2562499940395355, "reward_std": 0.49611565470695496, "rewards/reward_func/mean": 0.2562499940395355, "rewards/reward_func/std": 0.4593765139579773, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6207092180848122, "epoch": 0.390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.18421423435211182, "learning_rate": 3.952631578947368e-06, "loss": 0.0, "num_tokens": 937968.0, "reward": 0.2251249998807907, "reward_std": 0.44199562072753906, "rewards/reward_func/mean": 0.2251249998807907, "rewards/reward_func/std": 0.40922626852989197, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7159414663910866, "epoch": 0.392578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21772877871990204, "learning_rate": 3.947368421052632e-06, "loss": 0.0, "num_tokens": 942312.0, "reward": 0.1796875149011612, "reward_std": 0.2510603666305542, "rewards/reward_func/mean": 0.1796875149011612, "rewards/reward_func/std": 0.33246493339538574, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6835194565355778, "epoch": 0.39453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22371238470077515, "learning_rate": 3.942105263157895e-06, "loss": 0.0, "num_tokens": 947480.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7550305463373661, "epoch": 0.396484375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.936842105263159e-06, "loss": 0.0, "num_tokens": 952168.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7681427337229252, "epoch": 0.3984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.22429360449314117, "learning_rate": 3.931578947368421e-06, "loss": -0.0, "num_tokens": 956920.0, "reward": 0.4778124988079071, "reward_std": 0.46526336669921875, "rewards/reward_func/mean": 0.4778125286102295, "rewards/reward_func/std": 0.4945225715637207, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7594692595303059, "epoch": 0.400390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24203604459762573, "learning_rate": 3.926315789473685e-06, "loss": 0.0, "num_tokens": 961672.0, "reward": 0.4778124988079071, "reward_std": 0.46526336669921875, "rewards/reward_func/mean": 0.4778125286102295, "rewards/reward_func/std": 0.4945225715637207, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7869609482586384, "epoch": 0.40234375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.921052631578947e-06, "loss": 0.0, "num_tokens": 966320.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4896150715649128, "epoch": 0.404296875, "frac_reward_zero_std": 0.5, "grad_norm": 0.12556888163089752, "learning_rate": 3.9157894736842104e-06, "loss": -0.0, "num_tokens": 971120.0, "reward": 0.11949999630451202, "reward_std": 0.23899999260902405, "rewards/reward_func/mean": 0.11949999630451202, "rewards/reward_func/std": 0.33799704909324646, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5013071876019239, "epoch": 0.40625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20768725872039795, "learning_rate": 3.9105263157894744e-06, "loss": 0.0, "num_tokens": 975976.0, "reward": 0.3636625111103058, "reward_std": 0.5205842852592468, "rewards/reward_func/mean": 0.3636625111103058, "rewards/reward_func/std": 0.49861037731170654, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7933114245533943, "epoch": 0.408203125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.905263157894737e-06, "loss": 0.0, "num_tokens": 980408.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6895494759082794, "epoch": 0.41015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23387707769870758, "learning_rate": 3.900000000000001e-06, "loss": -0.0, "num_tokens": 985104.0, "reward": 0.600100040435791, "reward_std": 0.5192785263061523, "rewards/reward_func/mean": 0.6000999808311462, "rewards/reward_func/std": 0.4977251887321472, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7427148595452309, "epoch": 0.412109375, "frac_reward_zero_std": 0.5, "grad_norm": 0.16066974401474, "learning_rate": 3.894736842105263e-06, "loss": 0.0, "num_tokens": 989448.0, "reward": 0.885937511920929, "reward_std": 0.22812499105930328, "rewards/reward_func/mean": 0.885937511920929, "rewards/reward_func/std": 0.32261747121810913, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7897635065019131, "epoch": 0.4140625, "frac_reward_zero_std": 0.5, "grad_norm": 0.17688219249248505, "learning_rate": 3.889473684210527e-06, "loss": 0.0, "num_tokens": 994096.0, "reward": 0.12386249750852585, "reward_std": 0.24357615411281586, "rewards/reward_func/mean": 0.12386249750852585, "rewards/reward_func/std": 0.3453129827976227, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49311958625912666, "epoch": 0.416015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.17744576930999756, "learning_rate": 3.884210526315789e-06, "loss": 0.0, "num_tokens": 998744.0, "reward": 0.2519875168800354, "reward_std": 0.491636723279953, "rewards/reward_func/mean": 0.251987487077713, "rewards/reward_func/std": 0.4552379846572876, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7735568955540657, "epoch": 0.41796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2734695374965668, "learning_rate": 3.878947368421053e-06, "loss": -0.0, "num_tokens": 1003264.0, "reward": 0.37542498111724854, "reward_std": 0.5312633514404297, "rewards/reward_func/mean": 0.37542498111724854, "rewards/reward_func/std": 0.5113483667373657, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6551911272108555, "epoch": 0.419921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21010510623455048, "learning_rate": 3.873684210526316e-06, "loss": 0.0, "num_tokens": 1008120.0, "reward": 0.6193125247955322, "reward_std": 0.5183161497116089, "rewards/reward_func/mean": 0.6193125247955322, "rewards/reward_func/std": 0.49596303701400757, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6946419551968575, "epoch": 0.421875, "frac_reward_zero_std": 0.5, "grad_norm": 0.18316613137722015, "learning_rate": 3.868421052631579e-06, "loss": 0.0, "num_tokens": 1012760.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.594407208263874, "epoch": 0.423828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20372089743614197, "learning_rate": 3.863157894736843e-06, "loss": 0.0, "num_tokens": 1017496.0, "reward": 0.4871875047683716, "reward_std": 0.5233246684074402, "rewards/reward_func/mean": 0.4871875047683716, "rewards/reward_func/std": 0.48450902104377747, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4554303726181388, "epoch": 0.42578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20890522003173828, "learning_rate": 3.857894736842105e-06, "loss": 0.0, "num_tokens": 1021992.0, "reward": 0.5825625061988831, "reward_std": 0.49896040558815, "rewards/reward_func/mean": 0.5825625061988831, "rewards/reward_func/std": 0.47897282242774963, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9218173250555992, "epoch": 0.427734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.22832128405570984, "learning_rate": 3.852631578947369e-06, "loss": 0.0, "num_tokens": 1026488.0, "reward": 0.629687488079071, "reward_std": 0.5293000936508179, "rewards/reward_func/mean": 0.629687488079071, "rewards/reward_func/std": 0.5112107992172241, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6961621716618538, "epoch": 0.4296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2567530870437622, "learning_rate": 3.8473684210526316e-06, "loss": -0.0, "num_tokens": 1031136.0, "reward": 0.4947499930858612, "reward_std": 0.48889586329460144, "rewards/reward_func/mean": 0.4947499930858612, "rewards/reward_func/std": 0.5162643790245056, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49037862569093704, "epoch": 0.431640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21043436229228973, "learning_rate": 3.842105263157895e-06, "loss": -0.0, "num_tokens": 1035784.0, "reward": 0.6168999671936035, "reward_std": 0.5137720108032227, "rewards/reward_func/mean": 0.6168999671936035, "rewards/reward_func/std": 0.49751195311546326, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5241647306829691, "epoch": 0.43359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.189620703458786, "learning_rate": 3.836842105263158e-06, "loss": -0.0, "num_tokens": 1040752.0, "reward": 0.12037500739097595, "reward_std": 0.2284284085035324, "rewards/reward_func/mean": 0.12037499994039536, "rewards/reward_func/std": 0.31062963604927063, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7036157362163067, "epoch": 0.435546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.25706610083580017, "learning_rate": 3.831578947368421e-06, "loss": -0.0, "num_tokens": 1045768.0, "reward": 0.5924999713897705, "reward_std": 0.5081254839897156, "rewards/reward_func/mean": 0.5924999713897705, "rewards/reward_func/std": 0.49127423763275146, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6691739205271006, "epoch": 0.4375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21749237179756165, "learning_rate": 3.826315789473685e-06, "loss": -0.0, "num_tokens": 1050600.0, "reward": 0.6239999532699585, "reward_std": 0.5126060247421265, "rewards/reward_func/mean": 0.6239999532699585, "rewards/reward_func/std": 0.48936182260513306, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6261179707944393, "epoch": 0.439453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20746906101703644, "learning_rate": 3.821052631578947e-06, "loss": 0.0, "num_tokens": 1054944.0, "reward": 0.29374998807907104, "reward_std": 0.4726366698741913, "rewards/reward_func/mean": 0.29375001788139343, "rewards/reward_func/std": 0.4375765025615692, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5851390026509762, "epoch": 0.44140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23050355911254883, "learning_rate": 3.815789473684211e-06, "loss": 0.0, "num_tokens": 1059600.0, "reward": 0.6195999979972839, "reward_std": 0.5338436961174011, "rewards/reward_func/mean": 0.6195999979972839, "rewards/reward_func/std": 0.513155460357666, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 1.0195250883698463, "epoch": 0.443359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2704000473022461, "learning_rate": 3.810526315789474e-06, "loss": 0.0, "num_tokens": 1064080.0, "reward": 0.7473000288009644, "reward_std": 0.2940751314163208, "rewards/reward_func/mean": 0.7473000288009644, "rewards/reward_func/std": 0.4613037705421448, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8138170130550861, "epoch": 0.4453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24457702040672302, "learning_rate": 3.805263157894737e-06, "loss": -0.0, "num_tokens": 1069096.0, "reward": 0.3586999773979187, "reward_std": 0.5152043104171753, "rewards/reward_func/mean": 0.3586999773979187, "rewards/reward_func/std": 0.49505308270454407, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7082953453063965, "epoch": 0.447265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.22110335528850555, "learning_rate": 3.8000000000000005e-06, "loss": -0.0, "num_tokens": 1073736.0, "reward": 0.3614499866962433, "reward_std": 0.5230631828308105, "rewards/reward_func/mean": 0.3614499866962433, "rewards/reward_func/std": 0.49916091561317444, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.30846840143203735, "epoch": 0.44921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.1866290271282196, "learning_rate": 3.794736842105263e-06, "loss": 0.0, "num_tokens": 1078248.0, "reward": 0.46480000019073486, "reward_std": 0.5367048382759094, "rewards/reward_func/mean": 0.46480000019073486, "rewards/reward_func/std": 0.49689212441444397, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9043601844459772, "epoch": 0.451171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2427355796098709, "learning_rate": 3.789473684210527e-06, "loss": -0.0, "num_tokens": 1082592.0, "reward": 0.5265624523162842, "reward_std": 0.5467819571495056, "rewards/reward_func/mean": 0.526562511920929, "rewards/reward_func/std": 0.506891667842865, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5939224380999804, "epoch": 0.453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23449236154556274, "learning_rate": 3.7842105263157895e-06, "loss": 0.0, "num_tokens": 1087440.0, "reward": 0.3668999969959259, "reward_std": 0.5270397663116455, "rewards/reward_func/mean": 0.3668999969959259, "rewards/reward_func/std": 0.5063701272010803, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7096200175583363, "epoch": 0.455078125, "frac_reward_zero_std": 0.5, "grad_norm": 0.1598760038614273, "learning_rate": 3.778947368421053e-06, "loss": -0.0, "num_tokens": 1092192.0, "reward": 0.819100022315979, "reward_std": 0.23282161355018616, "rewards/reward_func/mean": 0.819100022315979, "rewards/reward_func/std": 0.33110320568084717, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7353874929249287, "epoch": 0.45703125, "frac_reward_zero_std": 0.5, "grad_norm": 0.17731121182441711, "learning_rate": 3.773684210526316e-06, "loss": 0.0, "num_tokens": 1096536.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8204295337200165, "epoch": 0.458984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.19422033429145813, "learning_rate": 3.768421052631579e-06, "loss": 0.0, "num_tokens": 1101328.0, "reward": 0.7335000038146973, "reward_std": 0.45602166652679443, "rewards/reward_func/mean": 0.7335000038146973, "rewards/reward_func/std": 0.4227708876132965, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6538256984204054, "epoch": 0.4609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.35198187828063965, "learning_rate": 3.7631578947368426e-06, "loss": 0.0, "num_tokens": 1105976.0, "reward": 0.4843499958515167, "reward_std": 0.4797285199165344, "rewards/reward_func/mean": 0.4843499958515167, "rewards/reward_func/std": 0.5180744528770447, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.37019990384578705, "epoch": 0.462890625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.7578947368421053e-06, "loss": 0.0, "num_tokens": 1110480.0, "reward": 0.9296000003814697, "reward_std": 0.0, "rewards/reward_func/mean": 0.9296000003814697, "rewards/reward_func/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6362588629126549, "epoch": 0.46484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.190359964966774, "learning_rate": 3.752631578947369e-06, "loss": 0.0, "num_tokens": 1115256.0, "reward": 0.249937504529953, "reward_std": 0.47089433670043945, "rewards/reward_func/mean": 0.2499374896287918, "rewards/reward_func/std": 0.436068594455719, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.914298091083765, "epoch": 0.466796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2647916376590729, "learning_rate": 3.7473684210526317e-06, "loss": 0.0, "num_tokens": 1119752.0, "reward": 0.7519875168800354, "reward_std": 0.4888792634010315, "rewards/reward_func/mean": 0.7519875168800354, "rewards/reward_func/std": 0.45273634791374207, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.516225041821599, "epoch": 0.46875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2023496925830841, "learning_rate": 3.7421052631578953e-06, "loss": -0.0, "num_tokens": 1124736.0, "reward": 0.11806250363588333, "reward_std": 0.21600480377674103, "rewards/reward_func/mean": 0.11806250363588333, "rewards/reward_func/std": 0.28930220007896423, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6972608156502247, "epoch": 0.470703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22246108949184418, "learning_rate": 3.736842105263158e-06, "loss": 0.0, "num_tokens": 1129560.0, "reward": 0.2282000035047531, "reward_std": 0.4564000070095062, "rewards/reward_func/mean": 0.2282000035047531, "rewards/reward_func/std": 0.42270201444625854, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7819927409291267, "epoch": 0.47265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21443378925323486, "learning_rate": 3.7315789473684216e-06, "loss": 0.0, "num_tokens": 1134352.0, "reward": 0.36834999918937683, "reward_std": 0.5177301168441772, "rewards/reward_func/mean": 0.36834999918937683, "rewards/reward_func/std": 0.49500060081481934, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5241367612034082, "epoch": 0.474609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24711766839027405, "learning_rate": 3.7263157894736848e-06, "loss": 0.0, "num_tokens": 1139000.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9131961166858673, "epoch": 0.4765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3064594864845276, "learning_rate": 3.7210526315789475e-06, "loss": -0.0, "num_tokens": 1143488.0, "reward": 0.8697500228881836, "reward_std": 0.2605000138282776, "rewards/reward_func/mean": 0.8697500228881836, "rewards/reward_func/std": 0.3517392575740814, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.766132302582264, "epoch": 0.478515625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.715789473684211e-06, "loss": 0.0, "num_tokens": 1148352.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6391206979751587, "epoch": 0.48046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.24102520942687988, "learning_rate": 3.710526315789474e-06, "loss": -0.0, "num_tokens": 1153104.0, "reward": 0.819100022315979, "reward_std": 0.241799995303154, "rewards/reward_func/mean": 0.819100022315979, "rewards/reward_func/std": 0.33110323548316956, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5933511331677437, "epoch": 0.482421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22239266335964203, "learning_rate": 3.7052631578947374e-06, "loss": 0.0, "num_tokens": 1158072.0, "reward": 0.36543750762939453, "reward_std": 0.2436250001192093, "rewards/reward_func/mean": 0.36543750762939453, "rewards/reward_func/std": 0.4533359408378601, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47794661670923233, "epoch": 0.484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21475116908550262, "learning_rate": 3.7e-06, "loss": 0.0, "num_tokens": 1162872.0, "reward": 0.5974999666213989, "reward_std": 0.5149734020233154, "rewards/reward_func/mean": 0.5974999666213989, "rewards/reward_func/std": 0.49477699398994446, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9037050679326057, "epoch": 0.486328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2283296287059784, "learning_rate": 3.6947368421052637e-06, "loss": -0.0, "num_tokens": 1167368.0, "reward": 0.3828125, "reward_std": 0.529944896697998, "rewards/reward_func/mean": 0.3828125, "rewards/reward_func/std": 0.5115163922309875, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7908897437155247, "epoch": 0.48828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21019713580608368, "learning_rate": 3.6894736842105265e-06, "loss": 0.0, "num_tokens": 1172112.0, "reward": 0.8685125112533569, "reward_std": 0.2369532436132431, "rewards/reward_func/mean": 0.8685125112533569, "rewards/reward_func/std": 0.32584938406944275, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6358069218695164, "epoch": 0.490234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2785353660583496, "learning_rate": 3.6842105263157896e-06, "loss": 0.0, "num_tokens": 1176896.0, "reward": 0.6227250099182129, "reward_std": 0.5166673064231873, "rewards/reward_func/mean": 0.6227250099182129, "rewards/reward_func/std": 0.49570804834365845, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.38082710839807987, "epoch": 0.4921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.20742739737033844, "learning_rate": 3.6789473684210532e-06, "loss": -0.0, "num_tokens": 1181544.0, "reward": 0.4947499930858612, "reward_std": 0.571418046951294, "rewards/reward_func/mean": 0.4947499930858612, "rewards/reward_func/std": 0.5290886163711548, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4866997255012393, "epoch": 0.494140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.17461475729942322, "learning_rate": 3.673684210526316e-06, "loss": 0.0, "num_tokens": 1186056.0, "reward": 0.6909000277519226, "reward_std": 0.46066808700561523, "rewards/reward_func/mean": 0.6909000277519226, "rewards/reward_func/std": 0.42650365829467773, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6999755539000034, "epoch": 0.49609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.22714459896087646, "learning_rate": 3.6684210526315796e-06, "loss": 0.0, "num_tokens": 1191056.0, "reward": 0.34780001640319824, "reward_std": 0.5009496808052063, "rewards/reward_func/mean": 0.34780001640319824, "rewards/reward_func/std": 0.4800656735897064, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5954524613916874, "epoch": 0.498046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2830187678337097, "learning_rate": 3.6631578947368423e-06, "loss": 0.0, "num_tokens": 1195904.0, "reward": 0.34780001640319824, "reward_std": 0.5050222873687744, "rewards/reward_func/mean": 0.34780001640319824, "rewards/reward_func/std": 0.4806229770183563, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.622910525649786, "epoch": 0.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.1351393461227417, "learning_rate": 3.657894736842106e-06, "loss": 0.0, "num_tokens": 1200880.0, "reward": 0.3295000195503235, "reward_std": 0.2197657823562622, "rewards/reward_func/mean": 0.3295000195503235, "rewards/reward_func/std": 0.45483532547950745, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6488723643124104, "epoch": 0.501953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21160662174224854, "learning_rate": 3.6526315789473686e-06, "loss": -0.0, "num_tokens": 1205728.0, "reward": 0.267612487077713, "reward_std": 0.48172545433044434, "rewards/reward_func/mean": 0.267612487077713, "rewards/reward_func/std": 0.4461328089237213, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9895205125212669, "epoch": 0.50390625, "frac_reward_zero_std": 0.5, "grad_norm": 0.18444561958312988, "learning_rate": 3.6473684210526318e-06, "loss": 0.0, "num_tokens": 1210304.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8820672258734703, "epoch": 0.505859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24869714677333832, "learning_rate": 3.642105263157895e-06, "loss": -0.0, "num_tokens": 1215008.0, "reward": 0.9872499704360962, "reward_std": 0.025499999523162842, "rewards/reward_func/mean": 0.9872499704360962, "rewards/reward_func/std": 0.024093568325042725, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6377357617020607, "epoch": 0.5078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22434614598751068, "learning_rate": 3.636842105263158e-06, "loss": 0.0, "num_tokens": 1219872.0, "reward": 0.6390625238418579, "reward_std": 0.518474817276001, "rewards/reward_func/mean": 0.6390625238418579, "rewards/reward_func/std": 0.4981410801410675, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8828465715050697, "epoch": 0.509765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2496601790189743, "learning_rate": 3.6315789473684217e-06, "loss": 0.0, "num_tokens": 1224624.0, "reward": 0.4991750121116638, "reward_std": 0.5548579692840576, "rewards/reward_func/mean": 0.4991750121116638, "rewards/reward_func/std": 0.5141252875328064, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9650222808122635, "epoch": 0.51171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2566354274749756, "learning_rate": 3.6263157894736844e-06, "loss": 0.0, "num_tokens": 1229096.0, "reward": 0.37812501192092896, "reward_std": 0.5366618037223816, "rewards/reward_func/mean": 0.37812501192092896, "rewards/reward_func/std": 0.5150308012962341, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.965212844312191, "epoch": 0.513671875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1699792593717575, "learning_rate": 3.621052631578948e-06, "loss": -0.0, "num_tokens": 1233576.0, "reward": 0.754687488079071, "reward_std": 0.2833658754825592, "rewards/reward_func/mean": 0.754687488079071, "rewards/reward_func/std": 0.45434102416038513, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6129315309226513, "epoch": 0.515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19248241186141968, "learning_rate": 3.6157894736842108e-06, "loss": 0.0, "num_tokens": 1238552.0, "reward": 0.22981250286102295, "reward_std": 0.43893176317214966, "rewards/reward_func/mean": 0.22981250286102295, "rewards/reward_func/std": 0.40648478269577026, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7329412698745728, "epoch": 0.517578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20942990481853485, "learning_rate": 3.610526315789474e-06, "loss": -0.0, "num_tokens": 1243528.0, "reward": 0.015625, "reward_std": 0.02392766997218132, "rewards/reward_func/mean": 0.015625, "rewards/reward_func/std": 0.0265165064483881, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5853278152644634, "epoch": 0.51953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.19774886965751648, "learning_rate": 3.605263157894737e-06, "loss": 0.0, "num_tokens": 1248192.0, "reward": 0.49303752183914185, "reward_std": 0.5586864948272705, "rewards/reward_func/mean": 0.49303752183914185, "rewards/reward_func/std": 0.5173379778862, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7437873594462872, "epoch": 0.521484375, "frac_reward_zero_std": 0.5, "grad_norm": 0.18486569821834564, "learning_rate": 3.6000000000000003e-06, "loss": 0.0, "num_tokens": 1253032.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6368556562811136, "epoch": 0.5234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21867771446704865, "learning_rate": 3.5947368421052634e-06, "loss": 0.0, "num_tokens": 1258256.0, "reward": 0.5062500238418579, "reward_std": 0.49611562490463257, "rewards/reward_func/mean": 0.5062500238418579, "rewards/reward_func/std": 0.5280946493148804, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6667620614171028, "epoch": 0.525390625, "frac_reward_zero_std": 0.5, "grad_norm": 0.16187067329883575, "learning_rate": 3.5894736842105266e-06, "loss": 0.0, "num_tokens": 1263120.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5582370385527611, "epoch": 0.52734375, "frac_reward_zero_std": 0.5, "grad_norm": 0.16511990129947662, "learning_rate": 3.58421052631579e-06, "loss": 0.0, "num_tokens": 1267480.0, "reward": 0.8843749761581421, "reward_std": 0.23125000298023224, "rewards/reward_func/mean": 0.8843749761581421, "rewards/reward_func/std": 0.32703688740730286, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49717821925878525, "epoch": 0.529296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19459761679172516, "learning_rate": 3.578947368421053e-06, "loss": 0.0, "num_tokens": 1272136.0, "reward": 0.6297374963760376, "reward_std": 0.5136181712150574, "rewards/reward_func/mean": 0.6297374963760376, "rewards/reward_func/std": 0.49753350019454956, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47147443797439337, "epoch": 0.53125, "frac_reward_zero_std": 0.0, "grad_norm": 0.17289377748966217, "learning_rate": 3.5736842105263157e-06, "loss": -0.0, "num_tokens": 1276648.0, "reward": 0.6951000094413757, "reward_std": 0.4634339511394501, "rewards/reward_func/mean": 0.6951000094413757, "rewards/reward_func/std": 0.42906418442726135, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6894197128713131, "epoch": 0.533203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24948523938655853, "learning_rate": 3.5684210526315792e-06, "loss": -0.0, "num_tokens": 1281440.0, "reward": 0.7309999465942383, "reward_std": 0.2882004380226135, "rewards/reward_func/mean": 0.7309999465942383, "rewards/reward_func/std": 0.4513968229293823, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8942212909460068, "epoch": 0.53515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.9948049783706665, "learning_rate": 3.5631578947368424e-06, "loss": 0.0, "num_tokens": 1286176.0, "reward": 0.7289999723434448, "reward_std": 0.48616960644721985, "rewards/reward_func/mean": 0.7289999723434448, "rewards/reward_func/std": 0.45014360547065735, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8072192035615444, "epoch": 0.537109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.5503926873207092, "learning_rate": 3.5578947368421056e-06, "loss": 0.0, "num_tokens": 1290680.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6735758259892464, "epoch": 0.5390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20834791660308838, "learning_rate": 3.5526315789473687e-06, "loss": 0.0, "num_tokens": 1295328.0, "reward": 0.4961625039577484, "reward_std": 0.4955860376358032, "rewards/reward_func/mean": 0.4961625039577484, "rewards/reward_func/std": 0.5272848010063171, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6799687743186951, "epoch": 0.541015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.26546961069107056, "learning_rate": 3.5473684210526323e-06, "loss": 0.0, "num_tokens": 1300344.0, "reward": 0.23640000820159912, "reward_std": 0.47280001640319824, "rewards/reward_func/mean": 0.23640000820159912, "rewards/reward_func/std": 0.43869248032569885, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8527603708207607, "epoch": 0.54296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.239075168967247, "learning_rate": 3.542105263157895e-06, "loss": 0.0, "num_tokens": 1304872.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47294606268405914, "epoch": 0.544921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.208218514919281, "learning_rate": 3.536842105263158e-06, "loss": -0.0, "num_tokens": 1309520.0, "reward": 0.3731499910354614, "reward_std": 0.5254724621772766, "rewards/reward_func/mean": 0.3731499910354614, "rewards/reward_func/std": 0.5016124248504639, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6301430575549603, "epoch": 0.546875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1511426568031311, "learning_rate": 3.5315789473684214e-06, "loss": 0.0, "num_tokens": 1314368.0, "reward": 0.22830000519752502, "reward_std": 0.26377108693122864, "rewards/reward_func/mean": 0.22830000519752502, "rewards/reward_func/std": 0.4228929877281189, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6302942372858524, "epoch": 0.548828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2503369152545929, "learning_rate": 3.5263157894736846e-06, "loss": 0.0, "num_tokens": 1319120.0, "reward": 0.6139749884605408, "reward_std": 0.4990615248680115, "rewards/reward_func/mean": 0.6139749884605408, "rewards/reward_func/std": 0.48867690563201904, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5613374076783657, "epoch": 0.55078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21636393666267395, "learning_rate": 3.5210526315789477e-06, "loss": 0.0, "num_tokens": 1323984.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5965826436877251, "epoch": 0.552734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.22059328854084015, "learning_rate": 3.515789473684211e-06, "loss": -0.0, "num_tokens": 1328328.0, "reward": 0.7463124990463257, "reward_std": 0.29738226532936096, "rewards/reward_func/mean": 0.7463124990463257, "rewards/reward_func/std": 0.45701852440834045, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6045653857290745, "epoch": 0.5546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2106325626373291, "learning_rate": 3.510526315789474e-06, "loss": 0.0, "num_tokens": 1332984.0, "reward": 0.48653751611709595, "reward_std": 0.48146504163742065, "rewards/reward_func/mean": 0.48653751611709595, "rewards/reward_func/std": 0.5103995203971863, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7015610095113516, "epoch": 0.556640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.16994379460811615, "learning_rate": 3.505263157894737e-06, "loss": 0.0, "num_tokens": 1337928.0, "reward": 0.12349999696016312, "reward_std": 0.23874559998512268, "rewards/reward_func/mean": 0.12350000441074371, "rewards/reward_func/std": 0.31001752614974976, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6296170633286238, "epoch": 0.55859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.216505229473114, "learning_rate": 3.5e-06, "loss": 0.0, "num_tokens": 1342712.0, "reward": 0.4699999988079071, "reward_std": 0.542709231376648, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.5024511218070984, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6994357369840145, "epoch": 0.560546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2235638052225113, "learning_rate": 3.4947368421052635e-06, "loss": 0.0, "num_tokens": 1347360.0, "reward": 0.4905624985694885, "reward_std": 0.5631812810897827, "rewards/reward_func/mean": 0.4905625283718109, "rewards/reward_func/std": 0.5214440822601318, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7445014677941799, "epoch": 0.5625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21883165836334229, "learning_rate": 3.4894736842105263e-06, "loss": -0.0, "num_tokens": 1352008.0, "reward": 0.8697500228881836, "reward_std": 0.2605000138282776, "rewards/reward_func/mean": 0.8697500228881836, "rewards/reward_func/std": 0.3517392575740814, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.48259856551885605, "epoch": 0.564453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20782361924648285, "learning_rate": 3.48421052631579e-06, "loss": -0.0, "num_tokens": 1356928.0, "reward": 0.3723375201225281, "reward_std": 0.5098629593849182, "rewards/reward_func/mean": 0.3723375201225281, "rewards/reward_func/std": 0.4901200532913208, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6990231722593307, "epoch": 0.56640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.22484789788722992, "learning_rate": 3.478947368421053e-06, "loss": 0.0, "num_tokens": 1361680.0, "reward": 0.12812499701976776, "reward_std": 0.2562499940395355, "rewards/reward_func/mean": 0.12812499701976776, "rewards/reward_func/std": 0.3523993194103241, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5128569137305021, "epoch": 0.568359375, "frac_reward_zero_std": 0.5, "grad_norm": 0.14201033115386963, "learning_rate": 3.473684210526316e-06, "loss": -0.0, "num_tokens": 1366648.0, "reward": 0.11256249994039536, "reward_std": 0.22097797691822052, "rewards/reward_func/mean": 0.11256249994039536, "rewards/reward_func/std": 0.31335464119911194, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7180933952331543, "epoch": 0.5703125, "frac_reward_zero_std": 0.5, "grad_norm": 0.2024068832397461, "learning_rate": 3.4684210526315794e-06, "loss": 0.0, "num_tokens": 1371400.0, "reward": 0.8334375023841858, "reward_std": 0.21312500536441803, "rewards/reward_func/mean": 0.8334375023841858, "rewards/reward_func/std": 0.30140429735183716, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5293250977993011, "epoch": 0.572265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24872228503227234, "learning_rate": 3.463157894736842e-06, "loss": 0.0, "num_tokens": 1376624.0, "reward": 0.5025625228881836, "reward_std": 0.4810871481895447, "rewards/reward_func/mean": 0.5025625228881836, "rewards/reward_func/std": 0.5211412906646729, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5247739069163799, "epoch": 0.57421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.25173091888427734, "learning_rate": 3.4578947368421057e-06, "loss": 0.0, "num_tokens": 1381472.0, "reward": 0.5079500079154968, "reward_std": 0.5440911054611206, "rewards/reward_func/mean": 0.5079500079154968, "rewards/reward_func/std": 0.5037304759025574, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.59282411262393, "epoch": 0.576171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21082493662834167, "learning_rate": 3.4526315789473684e-06, "loss": -0.0, "num_tokens": 1386312.0, "reward": 0.6282625198364258, "reward_std": 0.5091252326965332, "rewards/reward_func/mean": 0.6282625198364258, "rewards/reward_func/std": 0.489662766456604, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6454604081809521, "epoch": 0.578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23169437050819397, "learning_rate": 3.447368421052632e-06, "loss": 0.0, "num_tokens": 1390640.0, "reward": 0.8723000288009644, "reward_std": 0.25540000200271606, "rewards/reward_func/mean": 0.8723000288009644, "rewards/reward_func/std": 0.3525434732437134, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.521615531295538, "epoch": 0.580078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23611055314540863, "learning_rate": 3.4421052631578947e-06, "loss": 0.0, "num_tokens": 1395296.0, "reward": 0.24806250631809235, "reward_std": 0.4919757843017578, "rewards/reward_func/mean": 0.24806249141693115, "rewards/reward_func/std": 0.4555468261241913, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7950318790972233, "epoch": 0.58203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.30078282952308655, "learning_rate": 3.4368421052631583e-06, "loss": -0.0, "num_tokens": 1399824.0, "reward": 0.5103750228881836, "reward_std": 0.5537556409835815, "rewards/reward_func/mean": 0.5103750228881836, "rewards/reward_func/std": 0.5127608180046082, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5492544081062078, "epoch": 0.583984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2188585102558136, "learning_rate": 3.4315789473684215e-06, "loss": 0.0, "num_tokens": 1404600.0, "reward": 0.25306248664855957, "reward_std": 0.2876826226711273, "rewards/reward_func/mean": 0.25306248664855957, "rewards/reward_func/std": 0.4343123733997345, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9911265224218369, "epoch": 0.5859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.27803367376327515, "learning_rate": 3.4263157894736842e-06, "loss": 0.0, "num_tokens": 1409072.0, "reward": 0.6269875168800354, "reward_std": 0.5301945805549622, "rewards/reward_func/mean": 0.6269875168800354, "rewards/reward_func/std": 0.5088964104652405, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5954888928681612, "epoch": 0.587890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20479944348335266, "learning_rate": 3.421052631578948e-06, "loss": -0.0, "num_tokens": 1414032.0, "reward": 0.23137500882148743, "reward_std": 0.43826958537101746, "rewards/reward_func/mean": 0.23137500882148743, "rewards/reward_func/std": 0.40582817792892456, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.521581256762147, "epoch": 0.58984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2192363440990448, "learning_rate": 3.4157894736842106e-06, "loss": 0.0, "num_tokens": 1418824.0, "reward": 0.36943748593330383, "reward_std": 0.5057978630065918, "rewards/reward_func/mean": 0.36943748593330383, "rewards/reward_func/std": 0.48606905341148376, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.41881873086094856, "epoch": 0.591796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19346477091312408, "learning_rate": 3.410526315789474e-06, "loss": 0.0, "num_tokens": 1423616.0, "reward": 0.47993749380111694, "reward_std": 0.4731535315513611, "rewards/reward_func/mean": 0.47993749380111694, "rewards/reward_func/std": 0.5031650066375732, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.48736816458404064, "epoch": 0.59375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1667778193950653, "learning_rate": 3.405263157894737e-06, "loss": 0.0, "num_tokens": 1428288.0, "reward": 0.6197500228881836, "reward_std": 0.2395000010728836, "rewards/reward_func/mean": 0.6197500228881836, "rewards/reward_func/std": 0.5133981704711914, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8001267649233341, "epoch": 0.595703125, "frac_reward_zero_std": 0.5, "grad_norm": 0.1438862383365631, "learning_rate": 3.4000000000000005e-06, "loss": -0.0, "num_tokens": 1432912.0, "reward": 0.8511874675750732, "reward_std": 0.23108144104480743, "rewards/reward_func/mean": 0.8511874675750732, "rewards/reward_func/std": 0.3289700150489807, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7933619171380997, "epoch": 0.59765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.22013674676418304, "learning_rate": 3.3947368421052636e-06, "loss": 0.0, "num_tokens": 1437576.0, "reward": 0.37968748807907104, "reward_std": 0.5357083082199097, "rewards/reward_func/mean": 0.37968748807907104, "rewards/reward_func/std": 0.5138239860534668, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9326525256037712, "epoch": 0.599609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2728516757488251, "learning_rate": 3.3894736842105264e-06, "loss": -0.0, "num_tokens": 1442096.0, "reward": 0.511787474155426, "reward_std": 0.009360386058688164, "rewards/reward_func/mean": 0.5117875337600708, "rewards/reward_func/std": 0.5104570984840393, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.867360845208168, "epoch": 0.6015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2490600049495697, "learning_rate": 3.38421052631579e-06, "loss": 0.0, "num_tokens": 1446840.0, "reward": 0.49105000495910645, "reward_std": 0.4941999316215515, "rewards/reward_func/mean": 0.49105000495910645, "rewards/reward_func/std": 0.5251454710960388, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4640714302659035, "epoch": 0.603515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20176871120929718, "learning_rate": 3.3789473684210527e-06, "loss": -0.0, "num_tokens": 1451328.0, "reward": 0.5872499942779541, "reward_std": 0.4882524013519287, "rewards/reward_func/mean": 0.5872499942779541, "rewards/reward_func/std": 0.4727397859096527, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4609074145555496, "epoch": 0.60546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19601713120937347, "learning_rate": 3.3736842105263163e-06, "loss": 0.0, "num_tokens": 1456128.0, "reward": 0.4829750061035156, "reward_std": 0.4673447012901306, "rewards/reward_func/mean": 0.4829750061035156, "rewards/reward_func/std": 0.4967931807041168, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7879729568958282, "epoch": 0.607421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2218167930841446, "learning_rate": 3.368421052631579e-06, "loss": 0.0, "num_tokens": 1460920.0, "reward": 0.9754500389099121, "reward_std": 0.02152196317911148, "rewards/reward_func/mean": 0.9754499793052673, "rewards/reward_func/std": 0.020223822444677353, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4728757180273533, "epoch": 0.609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21725128591060638, "learning_rate": 3.3631578947368426e-06, "loss": -0.0, "num_tokens": 1465560.0, "reward": 0.37002500891685486, "reward_std": 0.25100889801979065, "rewards/reward_func/mean": 0.37002497911453247, "rewards/reward_func/std": 0.5039855241775513, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5092521663755178, "epoch": 0.611328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.19549833238124847, "learning_rate": 3.3578947368421054e-06, "loss": 0.0, "num_tokens": 1470360.0, "reward": 0.47251251339912415, "reward_std": 0.5420471429824829, "rewards/reward_func/mean": 0.47251251339912415, "rewards/reward_func/std": 0.5018724799156189, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7892782241106033, "epoch": 0.61328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20750151574611664, "learning_rate": 3.3526315789473685e-06, "loss": 0.0, "num_tokens": 1474848.0, "reward": 0.628125011920929, "reward_std": 0.5351123809814453, "rewards/reward_func/mean": 0.628125011920929, "rewards/reward_func/std": 0.5132942199707031, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8311872594058514, "epoch": 0.615234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2424483448266983, "learning_rate": 3.347368421052632e-06, "loss": 0.0, "num_tokens": 1479344.0, "reward": 0.7445999979972839, "reward_std": 0.49650442600250244, "rewards/reward_func/mean": 0.7445999979972839, "rewards/reward_func/std": 0.45967379212379456, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6033541187644005, "epoch": 0.6171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.24076202511787415, "learning_rate": 3.342105263157895e-06, "loss": 0.0, "num_tokens": 1484192.0, "reward": 0.350600004196167, "reward_std": 0.5059750080108643, "rewards/reward_func/mean": 0.350600004196167, "rewards/reward_func/std": 0.48401686549186707, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7708289846777916, "epoch": 0.619140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24534860253334045, "learning_rate": 3.3368421052631584e-06, "loss": 0.0, "num_tokens": 1489200.0, "reward": 0.350600004196167, "reward_std": 0.4990043640136719, "rewards/reward_func/mean": 0.350600004196167, "rewards/reward_func/std": 0.4842973053455353, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7339390330016613, "epoch": 0.62109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24659650027751923, "learning_rate": 3.331578947368421e-06, "loss": -0.0, "num_tokens": 1493688.0, "reward": 0.518750011920929, "reward_std": 0.4794538617134094, "rewards/reward_func/mean": 0.5187499523162842, "rewards/reward_func/std": 0.514738142490387, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8104076609015465, "epoch": 0.623046875, "frac_reward_zero_std": 0.5, "grad_norm": 0.17738322913646698, "learning_rate": 3.3263157894736848e-06, "loss": 0.0, "num_tokens": 1498304.0, "reward": 0.7335000038146973, "reward_std": 0.28232428431510925, "rewards/reward_func/mean": 0.7335000038146973, "rewards/reward_func/std": 0.4527260363101959, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3726614834740758, "epoch": 0.625, "frac_reward_zero_std": 0.0, "grad_norm": 0.17689530551433563, "learning_rate": 3.3210526315789475e-06, "loss": 0.0, "num_tokens": 1502784.0, "reward": 0.8113000392913818, "reward_std": 0.23659999668598175, "rewards/reward_func/mean": 0.8113000392913818, "rewards/reward_func/std": 0.32786741852760315, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8716227933764458, "epoch": 0.626953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.25260844826698303, "learning_rate": 3.3157894736842107e-06, "loss": 0.0, "num_tokens": 1507304.0, "reward": 0.9945999979972839, "reward_std": 0.010800004005432129, "rewards/reward_func/mean": 0.9945999979972839, "rewards/reward_func/std": 0.009998860768973827, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5032209325581789, "epoch": 0.62890625, "frac_reward_zero_std": 0.5, "grad_norm": 0.11165645718574524, "learning_rate": 3.310526315789474e-06, "loss": 0.0, "num_tokens": 1512264.0, "reward": 0.1237500011920929, "reward_std": 0.23090852797031403, "rewards/reward_func/mean": 0.1237500011920929, "rewards/reward_func/std": 0.3300081193447113, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6931778788566589, "epoch": 0.630859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.22502507269382477, "learning_rate": 3.305263157894737e-06, "loss": -0.0, "num_tokens": 1517064.0, "reward": 0.35860002040863037, "reward_std": 0.5156620740890503, "rewards/reward_func/mean": 0.358599990606308, "rewards/reward_func/std": 0.49546343088150024, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5616287402808666, "epoch": 0.6328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22584268450737, "learning_rate": 3.3000000000000006e-06, "loss": 0.0, "num_tokens": 1522032.0, "reward": 0.2150000035762787, "reward_std": 0.4300000071525574, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.39838388562202454, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6828614771366119, "epoch": 0.634765625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1657702773809433, "learning_rate": 3.2947368421052633e-06, "loss": 0.0, "num_tokens": 1526888.0, "reward": 0.878125011920929, "reward_std": 0.24375002086162567, "rewards/reward_func/mean": 0.878125011920929, "rewards/reward_func/std": 0.34471458196640015, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4955676291137934, "epoch": 0.63671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.23417575657367706, "learning_rate": 3.289473684210527e-06, "loss": -0.0, "num_tokens": 1531496.0, "reward": 0.7447500228881836, "reward_std": 0.2991751432418823, "rewards/reward_func/mean": 0.7447500228881836, "rewards/reward_func/std": 0.4598980247974396, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8880259320139885, "epoch": 0.638671875, "frac_reward_zero_std": 0.5, "grad_norm": 0.16445371508598328, "learning_rate": 3.2842105263157897e-06, "loss": 0.0, "num_tokens": 1535984.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7877190988510847, "epoch": 0.640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24739286303520203, "learning_rate": 3.278947368421053e-06, "loss": 0.0, "num_tokens": 1540744.0, "reward": 0.6195374727249146, "reward_std": 0.5129984021186829, "rewards/reward_func/mean": 0.6195374727249146, "rewards/reward_func/std": 0.48989206552505493, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7118626944720745, "epoch": 0.642578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21109792590141296, "learning_rate": 3.273684210526316e-06, "loss": 0.0, "num_tokens": 1545728.0, "reward": 0.3330000042915344, "reward_std": 0.47834351658821106, "rewards/reward_func/mean": 0.3330000042915344, "rewards/reward_func/std": 0.45958366990089417, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7776865549385548, "epoch": 0.64453125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.268421052631579e-06, "loss": 0.0, "num_tokens": 1550560.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8135379999876022, "epoch": 0.646484375, "frac_reward_zero_std": 0.5, "grad_norm": 0.17627958953380585, "learning_rate": 3.2631578947368423e-06, "loss": 0.0, "num_tokens": 1555040.0, "reward": 0.878125011920929, "reward_std": 0.24375002086162567, "rewards/reward_func/mean": 0.878125011920929, "rewards/reward_func/std": 0.34471458196640015, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6435816325247288, "epoch": 0.6484375, "frac_reward_zero_std": 0.5, "grad_norm": 0.17974931001663208, "learning_rate": 3.2578947368421055e-06, "loss": 0.0, "num_tokens": 1559904.0, "reward": 0.7515624761581421, "reward_std": 0.2868822515010834, "rewards/reward_func/mean": 0.7515624761581421, "rewards/reward_func/std": 0.4600290060043335, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.39286974538117647, "epoch": 0.650390625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.252631578947369e-06, "loss": 0.0, "num_tokens": 1564416.0, "reward": 0.9296000003814697, "reward_std": 0.0, "rewards/reward_func/mean": 0.9296000003814697, "rewards/reward_func/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6735682226717472, "epoch": 0.65234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20429600775241852, "learning_rate": 3.247368421052632e-06, "loss": 0.0, "num_tokens": 1569208.0, "reward": 0.6209875345230103, "reward_std": 0.4939994215965271, "rewards/reward_func/mean": 0.6209875345230103, "rewards/reward_func/std": 0.4771609306335449, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6759085133671761, "epoch": 0.654296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2895135283470154, "learning_rate": 3.2421052631578945e-06, "loss": 0.0, "num_tokens": 1573728.0, "reward": 0.7593749761581421, "reward_std": 0.48125001788139343, "rewards/reward_func/mean": 0.7593749761581421, "rewards/reward_func/std": 0.44560104608535767, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.45824577659368515, "epoch": 0.65625, "frac_reward_zero_std": 0.0, "grad_norm": 0.22657960653305054, "learning_rate": 3.236842105263158e-06, "loss": -0.0, "num_tokens": 1578240.0, "reward": 0.6972000002861023, "reward_std": 0.46480000019073486, "rewards/reward_func/mean": 0.6972000002861023, "rewards/reward_func/std": 0.43032118678092957, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7448481060564518, "epoch": 0.658203125, "frac_reward_zero_std": 0.5, "grad_norm": 0.17294245958328247, "learning_rate": 3.2315789473684213e-06, "loss": -0.0, "num_tokens": 1583088.0, "reward": 0.11026249825954437, "reward_std": 0.21637839078903198, "rewards/reward_func/mean": 0.11026249825954437, "rewards/reward_func/std": 0.3068498373031616, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.48510024510324, "epoch": 0.66015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20591919124126434, "learning_rate": 3.2263157894736845e-06, "loss": 0.0, "num_tokens": 1587872.0, "reward": 0.484250009059906, "reward_std": 0.5447538495063782, "rewards/reward_func/mean": 0.4842499792575836, "rewards/reward_func/std": 0.5043662190437317, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49838624335825443, "epoch": 0.662109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18662779033184052, "learning_rate": 3.2210526315789476e-06, "loss": 0.0, "num_tokens": 1592672.0, "reward": 0.8531500101089478, "reward_std": 0.2557721734046936, "rewards/reward_func/mean": 0.8531500101089478, "rewards/reward_func/std": 0.3453153371810913, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7124339491128922, "epoch": 0.6640625, "frac_reward_zero_std": 0.5, "grad_norm": 0.23854242265224457, "learning_rate": 3.215789473684211e-06, "loss": -0.0, "num_tokens": 1597416.0, "reward": 0.8318749666213989, "reward_std": 0.2162500023841858, "rewards/reward_func/mean": 0.8318749666213989, "rewards/reward_func/std": 0.3058236837387085, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8913617357611656, "epoch": 0.666015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.28833919763565063, "learning_rate": 3.210526315789474e-06, "loss": 0.0, "num_tokens": 1601880.0, "reward": 0.8769874572753906, "reward_std": 0.2460249960422516, "rewards/reward_func/mean": 0.8769875168800354, "rewards/reward_func/std": 0.3392883837223053, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.48491984978318214, "epoch": 0.66796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2013910710811615, "learning_rate": 3.2052631578947367e-06, "loss": -0.0, "num_tokens": 1606664.0, "reward": 0.6037999987602234, "reward_std": 0.5043159127235413, "rewards/reward_func/mean": 0.6037999391555786, "rewards/reward_func/std": 0.4866776764392853, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.837503544986248, "epoch": 0.669921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2542946934700012, "learning_rate": 3.2000000000000003e-06, "loss": 0.0, "num_tokens": 1611408.0, "reward": 0.7344000339508057, "reward_std": 0.48996883630752563, "rewards/reward_func/mean": 0.7344000339508057, "rewards/reward_func/std": 0.45366016030311584, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7475596591830254, "epoch": 0.671875, "frac_reward_zero_std": 0.5, "grad_norm": 0.15543855726718903, "learning_rate": 3.1947368421052634e-06, "loss": 0.0, "num_tokens": 1615896.0, "reward": 0.7593749761581421, "reward_std": 0.27784982323646545, "rewards/reward_func/mean": 0.7593749761581421, "rewards/reward_func/std": 0.44555091857910156, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7178698666393757, "epoch": 0.673828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21593737602233887, "learning_rate": 3.1894736842105266e-06, "loss": 0.0, "num_tokens": 1620656.0, "reward": 0.6245999932289124, "reward_std": 0.25092828273773193, "rewards/reward_func/mean": 0.6245999932289124, "rewards/reward_func/std": 0.4903907775878906, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7785428650677204, "epoch": 0.67578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23444195091724396, "learning_rate": 3.1842105263157898e-06, "loss": -0.0, "num_tokens": 1625144.0, "reward": 0.7519874572753906, "reward_std": 0.48887720704078674, "rewards/reward_func/mean": 0.7519874572753906, "rewards/reward_func/std": 0.4527363181114197, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6989172957837582, "epoch": 0.677734375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.178947368421053e-06, "loss": 0.0, "num_tokens": 1630144.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7183702476322651, "epoch": 0.6796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22046923637390137, "learning_rate": 3.173684210526316e-06, "loss": 0.0, "num_tokens": 1634800.0, "reward": 0.4881250262260437, "reward_std": 0.5570073127746582, "rewards/reward_func/mean": 0.4881250262260437, "rewards/reward_func/std": 0.5157098770141602, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7108337618410587, "epoch": 0.681640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.22072257101535797, "learning_rate": 3.168421052631579e-06, "loss": 0.0, "num_tokens": 1639416.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6336832232773304, "epoch": 0.68359375, "frac_reward_zero_std": 0.5, "grad_norm": 0.16415120661258698, "learning_rate": 3.1631578947368424e-06, "loss": 0.0, "num_tokens": 1644272.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6293626204133034, "epoch": 0.685546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.23304316401481628, "learning_rate": 3.157894736842105e-06, "loss": -0.0, "num_tokens": 1649232.0, "reward": 0.12349999696016312, "reward_std": 0.2251299023628235, "rewards/reward_func/mean": 0.12349999696016312, "rewards/reward_func/std": 0.3091523051261902, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6902714185416698, "epoch": 0.6875, "frac_reward_zero_std": 0.0, "grad_norm": 0.25497615337371826, "learning_rate": 3.1526315789473688e-06, "loss": -0.0, "num_tokens": 1653984.0, "reward": 0.8506499528884888, "reward_std": 0.2553556561470032, "rewards/reward_func/mean": 0.8506499528884888, "rewards/reward_func/std": 0.3441561758518219, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6613344326615334, "epoch": 0.689453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22403497993946075, "learning_rate": 3.147368421052632e-06, "loss": -0.0, "num_tokens": 1658976.0, "reward": 0.12974999845027924, "reward_std": 0.2284284085035324, "rewards/reward_func/mean": 0.12974999845027924, "rewards/reward_func/std": 0.30694079399108887, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5704779382795095, "epoch": 0.69140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2441885620355606, "learning_rate": 3.142105263157895e-06, "loss": -0.0, "num_tokens": 1663760.0, "reward": 0.004687500186264515, "reward_std": 0.006733439397066832, "rewards/reward_func/mean": 0.004687500186264515, "rewards/reward_func/std": 0.006469365209341049, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5089486921206117, "epoch": 0.693359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2328837364912033, "learning_rate": 3.1368421052631582e-06, "loss": 0.0, "num_tokens": 1668544.0, "reward": 0.3531000018119812, "reward_std": 0.5088821649551392, "rewards/reward_func/mean": 0.3531000018119812, "rewards/reward_func/std": 0.487506628036499, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.712410818785429, "epoch": 0.6953125, "frac_reward_zero_std": 0.5, "grad_norm": 0.14800521731376648, "learning_rate": 3.131578947368421e-06, "loss": -0.0, "num_tokens": 1673408.0, "reward": 0.625, "reward_std": 0.25, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6078695468604565, "epoch": 0.697265625, "frac_reward_zero_std": 0.5, "grad_norm": 0.21499896049499512, "learning_rate": 3.1263157894736846e-06, "loss": 0.0, "num_tokens": 1677888.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7318508177995682, "epoch": 0.69921875, "frac_reward_zero_std": 0.5, "grad_norm": 0.18809860944747925, "learning_rate": 3.1210526315789473e-06, "loss": 0.0, "num_tokens": 1682352.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5955179370939732, "epoch": 0.701171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19998441636562347, "learning_rate": 3.115789473684211e-06, "loss": -0.0, "num_tokens": 1687200.0, "reward": 0.6271250247955322, "reward_std": 0.5026910901069641, "rewards/reward_func/mean": 0.6271250247955322, "rewards/reward_func/std": 0.4851891100406647, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4728749534115195, "epoch": 0.703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20382100343704224, "learning_rate": 3.1105263157894736e-06, "loss": 0.0, "num_tokens": 1691696.0, "reward": 0.8113000392913818, "reward_std": 0.23659999668598175, "rewards/reward_func/mean": 0.8113000392913818, "rewards/reward_func/std": 0.32786741852760315, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6648149769753218, "epoch": 0.705078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22265951335430145, "learning_rate": 3.1052631578947372e-06, "loss": -0.0, "num_tokens": 1696360.0, "reward": 0.6027500033378601, "reward_std": 0.5180938243865967, "rewards/reward_func/mean": 0.6027500033378601, "rewards/reward_func/std": 0.5003756880760193, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5913132168352604, "epoch": 0.70703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21587161719799042, "learning_rate": 3.1000000000000004e-06, "loss": 0.0, "num_tokens": 1701144.0, "reward": 0.7343375086784363, "reward_std": 0.4605855941772461, "rewards/reward_func/mean": 0.7343375086784363, "rewards/reward_func/std": 0.42718806862831116, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4091036804020405, "epoch": 0.708984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.17123205959796906, "learning_rate": 3.094736842105263e-06, "loss": -0.0, "num_tokens": 1705640.0, "reward": 0.5841249823570251, "reward_std": 0.4971931576728821, "rewards/reward_func/mean": 0.5841249823570251, "rewards/reward_func/std": 0.47686323523521423, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6606861557811499, "epoch": 0.7109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.23936234414577484, "learning_rate": 3.0894736842105267e-06, "loss": -0.0, "num_tokens": 1710288.0, "reward": 0.504687488079071, "reward_std": 0.5720410346984863, "rewards/reward_func/mean": 0.504687488079071, "rewards/reward_func/std": 0.5296536087989807, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5874370019882917, "epoch": 0.712890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23465895652770996, "learning_rate": 3.0842105263157895e-06, "loss": -0.0, "num_tokens": 1714944.0, "reward": 0.6230999827384949, "reward_std": 0.519683837890625, "rewards/reward_func/mean": 0.6230999827384949, "rewards/reward_func/std": 0.5026271939277649, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8287493027746677, "epoch": 0.71484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24288125336170197, "learning_rate": 3.078947368421053e-06, "loss": -0.0, "num_tokens": 1719696.0, "reward": 0.6187999844551086, "reward_std": 0.5346147418022156, "rewards/reward_func/mean": 0.6187999844551086, "rewards/reward_func/std": 0.5125207901000977, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5209563355892897, "epoch": 0.716796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.23250123858451843, "learning_rate": 3.0736842105263158e-06, "loss": 0.0, "num_tokens": 1724664.0, "reward": 0.0062500000931322575, "reward_std": 0.012500000186264515, "rewards/reward_func/mean": 0.0062500000931322575, "rewards/reward_func/std": 0.011572751216590405, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6145099401473999, "epoch": 0.71875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22183161973953247, "learning_rate": 3.0684210526315794e-06, "loss": -0.0, "num_tokens": 1729424.0, "reward": 0.5003125071525574, "reward_std": 0.5600941181182861, "rewards/reward_func/mean": 0.5003125071525574, "rewards/reward_func/std": 0.5186700820922852, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8084001671522856, "epoch": 0.720703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2436615526676178, "learning_rate": 3.0631578947368425e-06, "loss": 0.0, "num_tokens": 1734176.0, "reward": 0.4699999988079071, "reward_std": 0.542709231376648, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.5024511218070984, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6762276217341423, "epoch": 0.72265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23068293929100037, "learning_rate": 3.0578947368421053e-06, "loss": 0.0, "num_tokens": 1738800.0, "reward": 0.6190624833106995, "reward_std": 0.5181007981300354, "rewards/reward_func/mean": 0.6190624833106995, "rewards/reward_func/std": 0.4957561790943146, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6614982299506664, "epoch": 0.724609375, "frac_reward_zero_std": 0.5, "grad_norm": 0.14307619631290436, "learning_rate": 3.052631578947369e-06, "loss": 0.0, "num_tokens": 1743480.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.559177003800869, "epoch": 0.7265625, "frac_reward_zero_std": 0.5, "grad_norm": 0.15780669450759888, "learning_rate": 3.0473684210526316e-06, "loss": 0.0, "num_tokens": 1747824.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6498337648808956, "epoch": 0.728515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21541620790958405, "learning_rate": 3.042105263157895e-06, "loss": -0.0, "num_tokens": 1752608.0, "reward": 0.24124999344348907, "reward_std": 0.4659823775291443, "rewards/reward_func/mean": 0.24124999344348907, "rewards/reward_func/std": 0.4314158856868744, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5173940751701593, "epoch": 0.73046875, "frac_reward_zero_std": 0.5, "grad_norm": 0.14444416761398315, "learning_rate": 3.036842105263158e-06, "loss": -0.0, "num_tokens": 1757576.0, "reward": 0.0015625000232830644, "reward_std": 0.0031250000465661287, "rewards/reward_func/mean": 0.0015625000232830644, "rewards/reward_func/std": 0.0044194175861775875, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5764465928077698, "epoch": 0.732421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.1886185109615326, "learning_rate": 3.0315789473684215e-06, "loss": -0.0, "num_tokens": 1762504.0, "reward": 0.2711625099182129, "reward_std": 0.29757219552993774, "rewards/reward_func/mean": 0.2711625099182129, "rewards/reward_func/std": 0.437508225440979, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8418979570269585, "epoch": 0.734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.22740621864795685, "learning_rate": 3.0263157894736843e-06, "loss": 0.0, "num_tokens": 1767144.0, "reward": 0.35471248626708984, "reward_std": 0.49858343601226807, "rewards/reward_func/mean": 0.35471248626708984, "rewards/reward_func/std": 0.4733320474624634, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9156858287751675, "epoch": 0.736328125, "frac_reward_zero_std": 0.5, "grad_norm": 0.2001858800649643, "learning_rate": 3.0210526315789474e-06, "loss": 0.0, "num_tokens": 1771736.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6596429236233234, "epoch": 0.73828125, "frac_reward_zero_std": 0.5, "grad_norm": 0.1892218291759491, "learning_rate": 3.015789473684211e-06, "loss": 0.0, "num_tokens": 1776168.0, "reward": 0.7515624761581421, "reward_std": 0.2868822515010834, "rewards/reward_func/mean": 0.7515624761581421, "rewards/reward_func/std": 0.4600290060043335, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5691553540527821, "epoch": 0.740234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20194444060325623, "learning_rate": 3.0105263157894737e-06, "loss": -0.0, "num_tokens": 1780904.0, "reward": 0.8247499465942383, "reward_std": 0.2395000010728836, "rewards/reward_func/mean": 0.8247500061988831, "rewards/reward_func/std": 0.3333088457584381, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 1.0326480194926262, "epoch": 0.7421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22807976603507996, "learning_rate": 3.0052631578947373e-06, "loss": 0.0, "num_tokens": 1785640.0, "reward": 0.6212124824523926, "reward_std": 0.2459665983915329, "rewards/reward_func/mean": 0.6212124824523926, "rewards/reward_func/std": 0.4976014196872711, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8465277478098869, "epoch": 0.744140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.33321791887283325, "learning_rate": 3e-06, "loss": 0.0, "num_tokens": 1790168.0, "reward": 0.8723000288009644, "reward_std": 0.25540000200271606, "rewards/reward_func/mean": 0.8723000288009644, "rewards/reward_func/std": 0.3525434732437134, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.2553594410419464, "epoch": 0.74609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.17427082359790802, "learning_rate": 2.9947368421052637e-06, "loss": 0.0, "num_tokens": 1794664.0, "reward": 0.5823000073432922, "reward_std": 0.5016319751739502, "rewards/reward_func/mean": 0.5823000073432922, "rewards/reward_func/std": 0.48220303654670715, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6799057051539421, "epoch": 0.748046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2410704791545868, "learning_rate": 2.9894736842105264e-06, "loss": 0.0, "num_tokens": 1799424.0, "reward": 0.7316499948501587, "reward_std": 0.4879480004310608, "rewards/reward_func/mean": 0.7316499948501587, "rewards/reward_func/std": 0.45203354954719543, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47508667781949043, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.2326003611087799, "learning_rate": 2.9842105263157896e-06, "loss": -0.0, "num_tokens": 1804088.0, "reward": 0.265625, "reward_std": 0.4900358021259308, "rewards/reward_func/mean": 0.265625, "rewards/reward_func/std": 0.4540861248970032, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8876021355390549, "epoch": 0.751953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24729052186012268, "learning_rate": 2.9789473684210527e-06, "loss": 0.0, "num_tokens": 1808592.0, "reward": 0.7473000288009644, "reward_std": 0.2940751314163208, "rewards/reward_func/mean": 0.7473000288009644, "rewards/reward_func/std": 0.4613037705421448, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.44519135542213917, "epoch": 0.75390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.18563015758991241, "learning_rate": 2.973684210526316e-06, "loss": 0.0, "num_tokens": 1813552.0, "reward": 0.22750000655651093, "reward_std": 0.25122225284576416, "rewards/reward_func/mean": 0.22749999165534973, "rewards/reward_func/std": 0.39082661271095276, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.740038525313139, "epoch": 0.755859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20772859454154968, "learning_rate": 2.9684210526315795e-06, "loss": -0.0, "num_tokens": 1818296.0, "reward": 0.6243749856948853, "reward_std": 0.2402016967535019, "rewards/reward_func/mean": 0.6243749856948853, "rewards/reward_func/std": 0.4830171763896942, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7946259453892708, "epoch": 0.7578125, "frac_reward_zero_std": 0.5, "grad_norm": 0.14700205624103546, "learning_rate": 2.9631578947368422e-06, "loss": 0.0, "num_tokens": 1823056.0, "reward": 0.9474999904632568, "reward_std": 0.01500000525265932, "rewards/reward_func/mean": 0.9474999904632568, "rewards/reward_func/std": 0.02121320553123951, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5914437137544155, "epoch": 0.759765625, "frac_reward_zero_std": 0.5, "grad_norm": 0.21744617819786072, "learning_rate": 2.957894736842106e-06, "loss": 0.0, "num_tokens": 1827400.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6300115957856178, "epoch": 0.76171875, "frac_reward_zero_std": 0.5, "grad_norm": 0.16123859584331512, "learning_rate": 2.9526315789473685e-06, "loss": -0.0, "num_tokens": 1832408.0, "reward": 0.2418999969959259, "reward_std": 0.27935683727264404, "rewards/reward_func/mean": 0.2418999969959259, "rewards/reward_func/std": 0.44794899225234985, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6575647704303265, "epoch": 0.763671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.1817922294139862, "learning_rate": 2.9473684210526317e-06, "loss": -0.0, "num_tokens": 1837072.0, "reward": 0.3765625059604645, "reward_std": 0.5368822813034058, "rewards/reward_func/mean": 0.3765625059604645, "rewards/reward_func/std": 0.5162726044654846, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8860741257667542, "epoch": 0.765625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.942105263157895e-06, "loss": 0.0, "num_tokens": 1841600.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5626347567886114, "epoch": 0.767578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20179854333400726, "learning_rate": 2.936842105263158e-06, "loss": 0.0, "num_tokens": 1846392.0, "reward": 0.36006247997283936, "reward_std": 0.5131810903549194, "rewards/reward_func/mean": 0.36006247997283936, "rewards/reward_func/std": 0.493501216173172, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6695420295000076, "epoch": 0.76953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.25772982835769653, "learning_rate": 2.931578947368421e-06, "loss": 0.0, "num_tokens": 1851360.0, "reward": 0.11412499845027924, "reward_std": 0.22824999690055847, "rewards/reward_func/mean": 0.11412499845027924, "rewards/reward_func/std": 0.3128150701522827, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6333671249449253, "epoch": 0.771484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2417721301317215, "learning_rate": 2.9263157894736844e-06, "loss": -0.0, "num_tokens": 1856336.0, "reward": 0.12037499994039536, "reward_std": 0.22721248865127563, "rewards/reward_func/mean": 0.12037499994039536, "rewards/reward_func/std": 0.31041398644447327, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6140784211456776, "epoch": 0.7734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24161596596240997, "learning_rate": 2.921052631578948e-06, "loss": 0.0, "num_tokens": 1861136.0, "reward": 0.6094000339508057, "reward_std": 0.5234606862068176, "rewards/reward_func/mean": 0.6094000339508057, "rewards/reward_func/std": 0.5048978924751282, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.416771961376071, "epoch": 0.775390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19176459312438965, "learning_rate": 2.9157894736842107e-06, "loss": -0.0, "num_tokens": 1865976.0, "reward": 0.5075249671936035, "reward_std": 0.48087674379348755, "rewards/reward_func/mean": 0.5075249671936035, "rewards/reward_func/std": 0.5095809102058411, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7132963538169861, "epoch": 0.77734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21432211995124817, "learning_rate": 2.9105263157894743e-06, "loss": 0.0, "num_tokens": 1870416.0, "reward": 0.8723000288009644, "reward_std": 0.25540000200271606, "rewards/reward_func/mean": 0.8723000288009644, "rewards/reward_func/std": 0.3525434732437134, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6518449485301971, "epoch": 0.779296875, "frac_reward_zero_std": 0.5, "grad_norm": 0.16880500316619873, "learning_rate": 2.905263157894737e-06, "loss": 0.0, "num_tokens": 1875048.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8165195435285568, "epoch": 0.78125, "frac_reward_zero_std": 0.5, "grad_norm": 0.15151342749595642, "learning_rate": 2.9e-06, "loss": 0.0, "num_tokens": 1879520.0, "reward": 0.878125011920929, "reward_std": 0.24375002086162567, "rewards/reward_func/mean": 0.878125011920929, "rewards/reward_func/std": 0.34471458196640015, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5633701048791409, "epoch": 0.783203125, "frac_reward_zero_std": 0.5, "grad_norm": 0.181041419506073, "learning_rate": 2.8947368421052634e-06, "loss": 0.0, "num_tokens": 1884368.0, "reward": 0.23649999499320984, "reward_std": 0.27312225103378296, "rewards/reward_func/mean": 0.23649999499320984, "rewards/reward_func/std": 0.4379509389400482, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.32244748808443546, "epoch": 0.78515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.17149122059345245, "learning_rate": 2.8894736842105265e-06, "loss": 0.0, "num_tokens": 1888872.0, "reward": 0.6985000371932983, "reward_std": 0.270952433347702, "rewards/reward_func/mean": 0.6985000371932983, "rewards/reward_func/std": 0.4311384856700897, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9212587662041187, "epoch": 0.787109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3085295259952545, "learning_rate": 2.88421052631579e-06, "loss": -0.0, "num_tokens": 1893352.0, "reward": 0.6285499930381775, "reward_std": 0.5285625457763672, "rewards/reward_func/mean": 0.6285499930381775, "rewards/reward_func/std": 0.5069750547409058, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7726543471217155, "epoch": 0.7890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23899753391742706, "learning_rate": 2.878947368421053e-06, "loss": -0.0, "num_tokens": 1898088.0, "reward": 0.8645749688148499, "reward_std": 0.25090357661247253, "rewards/reward_func/mean": 0.8645749688148499, "rewards/reward_func/std": 0.3395808935165405, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5518805952742696, "epoch": 0.791015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20844319462776184, "learning_rate": 2.8736842105263164e-06, "loss": -0.0, "num_tokens": 1903064.0, "reward": 0.2329374998807907, "reward_std": 0.2587106227874756, "rewards/reward_func/mean": 0.2329374998807907, "rewards/reward_func/std": 0.40477776527404785, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.27654310688376427, "epoch": 0.79296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.16797520220279694, "learning_rate": 2.868421052631579e-06, "loss": 0.0, "num_tokens": 1907568.0, "reward": 0.6930000185966492, "reward_std": 0.27320215106010437, "rewards/reward_func/mean": 0.6930000185966492, "rewards/reward_func/std": 0.4277917444705963, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6657938584685326, "epoch": 0.794921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.26008403301239014, "learning_rate": 2.8631578947368423e-06, "loss": -0.0, "num_tokens": 1912408.0, "reward": 0.3424000144004822, "reward_std": 0.4964672327041626, "rewards/reward_func/mean": 0.3424000144004822, "rewards/reward_func/std": 0.47299036383628845, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5552172213792801, "epoch": 0.796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2306947261095047, "learning_rate": 2.8578947368421055e-06, "loss": 0.0, "num_tokens": 1917192.0, "reward": 0.3631874918937683, "reward_std": 0.5111160278320312, "rewards/reward_func/mean": 0.3631874918937683, "rewards/reward_func/std": 0.4909226894378662, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9417452700436115, "epoch": 0.798828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.26336851716041565, "learning_rate": 2.8526315789473687e-06, "loss": 0.0, "num_tokens": 1921688.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5507835112512112, "epoch": 0.80078125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.847368421052632e-06, "loss": 0.0, "num_tokens": 1926336.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8155883587896824, "epoch": 0.802734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.25022512674331665, "learning_rate": 2.842105263157895e-06, "loss": -0.0, "num_tokens": 1930944.0, "reward": 0.9845499992370605, "reward_std": 0.025080181658267975, "rewards/reward_func/mean": 0.9845499992370605, "rewards/reward_func/std": 0.02366715855896473, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.2771544838324189, "epoch": 0.8046875, "frac_reward_zero_std": 0.5, "grad_norm": 0.11498188227415085, "learning_rate": 2.8368421052631586e-06, "loss": -0.0, "num_tokens": 1935456.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5674882866442204, "epoch": 0.806640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.222222238779068, "learning_rate": 2.8315789473684213e-06, "loss": 0.0, "num_tokens": 1940104.0, "reward": 0.4912000000476837, "reward_std": 0.5675593614578247, "rewards/reward_func/mean": 0.4912000000476837, "rewards/reward_func/std": 0.5256202816963196, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.884392186999321, "epoch": 0.80859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2370668649673462, "learning_rate": 2.826315789473684e-06, "loss": 0.0, "num_tokens": 1944632.0, "reward": 0.5062500238418579, "reward_std": 0.5703184008598328, "rewards/reward_func/mean": 0.5062500238418579, "rewards/reward_func/std": 0.5280946493148804, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5134440064430237, "epoch": 0.810546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.18468965590000153, "learning_rate": 2.8210526315789476e-06, "loss": 0.0, "num_tokens": 1949600.0, "reward": 0.13600000739097595, "reward_std": 0.2231663018465042, "rewards/reward_func/mean": 0.13600000739097595, "rewards/reward_func/std": 0.30529868602752686, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6705011464655399, "epoch": 0.8125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22873243689537048, "learning_rate": 2.815789473684211e-06, "loss": 0.0, "num_tokens": 1954392.0, "reward": 0.4972499907016754, "reward_std": 0.5742101669311523, "rewards/reward_func/mean": 0.4972499907016754, "rewards/reward_func/std": 0.5316314101219177, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9413636922836304, "epoch": 0.814453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22522786259651184, "learning_rate": 2.810526315789474e-06, "loss": 0.0, "num_tokens": 1958888.0, "reward": 0.6312500238418579, "reward_std": 0.5315045118331909, "rewards/reward_func/mean": 0.6312500238418579, "rewards/reward_func/std": 0.5090256929397583, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7534596920013428, "epoch": 0.81640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2349129617214203, "learning_rate": 2.805263157894737e-06, "loss": 0.0, "num_tokens": 1963752.0, "reward": 0.503125011920929, "reward_std": 0.5737417936325073, "rewards/reward_func/mean": 0.503125011920929, "rewards/reward_func/std": 0.5312027335166931, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7274225316941738, "epoch": 0.818359375, "frac_reward_zero_std": 0.5, "grad_norm": 0.17489460110664368, "learning_rate": 2.8000000000000003e-06, "loss": 0.0, "num_tokens": 1968624.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6172376144677401, "epoch": 0.8203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2108323872089386, "learning_rate": 2.7947368421052635e-06, "loss": -0.0, "num_tokens": 1973280.0, "reward": 0.754687488079071, "reward_std": 0.4906249940395355, "rewards/reward_func/mean": 0.754687488079071, "rewards/reward_func/std": 0.45434102416038513, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4672897644340992, "epoch": 0.822265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.203911691904068, "learning_rate": 2.789473684210526e-06, "loss": -0.0, "num_tokens": 1978264.0, "reward": 0.12350000441074371, "reward_std": 0.23065190017223358, "rewards/reward_func/mean": 0.12350000441074371, "rewards/reward_func/std": 0.30972936749458313, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.40012666769325733, "epoch": 0.82421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19789475202560425, "learning_rate": 2.7842105263157898e-06, "loss": 0.0, "num_tokens": 1983232.0, "reward": 0.1172500029206276, "reward_std": 0.2345000058412552, "rewards/reward_func/mean": 0.1172500029206276, "rewards/reward_func/std": 0.3119211196899414, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6508579887449741, "epoch": 0.826171875, "frac_reward_zero_std": 0.5, "grad_norm": 0.13198713958263397, "learning_rate": 2.7789473684210525e-06, "loss": -0.0, "num_tokens": 1987864.0, "reward": 0.9679999947547913, "reward_std": 0.008000005967915058, "rewards/reward_func/mean": 0.9679999947547913, "rewards/reward_func/std": 0.011313710361719131, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8873192630708218, "epoch": 0.828125, "frac_reward_zero_std": 0.5, "grad_norm": 0.17348502576351166, "learning_rate": 2.773684210526316e-06, "loss": -0.0, "num_tokens": 1992728.0, "reward": 0.9947500228881836, "reward_std": 0.010500003583729267, "rewards/reward_func/mean": 0.9947500228881836, "rewards/reward_func/std": 0.014849240891635418, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4722721241414547, "epoch": 0.830078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2051362246274948, "learning_rate": 2.7684210526315793e-06, "loss": 0.0, "num_tokens": 1997376.0, "reward": 0.49486249685287476, "reward_std": 0.4869329631328583, "rewards/reward_func/mean": 0.49486249685287476, "rewards/reward_func/std": 0.5131271481513977, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.29726838506758213, "epoch": 0.83203125, "frac_reward_zero_std": 0.5, "grad_norm": 0.11943374574184418, "learning_rate": 2.7631578947368424e-06, "loss": 0.0, "num_tokens": 2001864.0, "reward": 0.8113000392913818, "reward_std": 0.23103395104408264, "rewards/reward_func/mean": 0.8113000392913818, "rewards/reward_func/std": 0.32786741852760315, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5415227115154266, "epoch": 0.833984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2265365570783615, "learning_rate": 2.7578947368421056e-06, "loss": -0.0, "num_tokens": 2006496.0, "reward": 0.6188374757766724, "reward_std": 0.2559331953525543, "rewards/reward_func/mean": 0.6188374757766724, "rewards/reward_func/std": 0.5024172067642212, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6843532212078571, "epoch": 0.8359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21580831706523895, "learning_rate": 2.7526315789473683e-06, "loss": -0.0, "num_tokens": 2011144.0, "reward": 0.012500000186264515, "reward_std": 0.01781497895717621, "rewards/reward_func/mean": 0.012500000186264515, "rewards/reward_func/std": 0.01767767034471035, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5906918495893478, "epoch": 0.837890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2688174843788147, "learning_rate": 2.747368421052632e-06, "loss": 0.0, "num_tokens": 2016000.0, "reward": 0.22010000050067902, "reward_std": 0.44020000100135803, "rewards/reward_func/mean": 0.22010000050067902, "rewards/reward_func/std": 0.40758687257766724, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.541338412091136, "epoch": 0.83984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.25273582339286804, "learning_rate": 2.7421052631578947e-06, "loss": 0.0, "num_tokens": 2020848.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5117669757455587, "epoch": 0.841796875, "frac_reward_zero_std": 0.5, "grad_norm": 0.14728084206581116, "learning_rate": 2.7368421052631583e-06, "loss": -0.0, "num_tokens": 2026016.0, "reward": 0.7562500238418579, "reward_std": 0.28164324164390564, "rewards/reward_func/mean": 0.7562500238418579, "rewards/reward_func/std": 0.45153507590293884, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7267967909574509, "epoch": 0.84375, "frac_reward_zero_std": 0.0, "grad_norm": 0.23854367434978485, "learning_rate": 2.7315789473684214e-06, "loss": 0.0, "num_tokens": 2030440.0, "reward": 0.7425000071525574, "reward_std": 0.4954078197479248, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.45874831080436707, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.582302026450634, "epoch": 0.845703125, "frac_reward_zero_std": 0.5, "grad_norm": 0.1844756305217743, "learning_rate": 2.7263157894736846e-06, "loss": -0.0, "num_tokens": 2035200.0, "reward": 0.8224999904632568, "reward_std": 0.23499999940395355, "rewards/reward_func/mean": 0.8224999904632568, "rewards/reward_func/std": 0.33234018087387085, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.816022414714098, "epoch": 0.84765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23570871353149414, "learning_rate": 2.7210526315789478e-06, "loss": -0.0, "num_tokens": 2039984.0, "reward": 0.35731250047683716, "reward_std": 0.496307909488678, "rewards/reward_func/mean": 0.35731250047683716, "rewards/reward_func/std": 0.4760379195213318, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.48291225358843803, "epoch": 0.849609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18518958985805511, "learning_rate": 2.7157894736842105e-06, "loss": -0.0, "num_tokens": 2044624.0, "reward": 0.512499988079071, "reward_std": 0.5633107423782349, "rewards/reward_func/mean": 0.512499988079071, "rewards/reward_func/std": 0.5215447545051575, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6258512511849403, "epoch": 0.8515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20368583500385284, "learning_rate": 2.710526315789474e-06, "loss": -0.0, "num_tokens": 2049472.0, "reward": 0.23640000820159912, "reward_std": 0.47280001640319824, "rewards/reward_func/mean": 0.23640000820159912, "rewards/reward_func/std": 0.43869248032569885, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5136358644813299, "epoch": 0.853515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2028164565563202, "learning_rate": 2.705263157894737e-06, "loss": -0.0, "num_tokens": 2054240.0, "reward": 0.25462499260902405, "reward_std": 0.2947234511375427, "rewards/reward_func/mean": 0.25462499260902405, "rewards/reward_func/std": 0.43380802869796753, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9727239087224007, "epoch": 0.85546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2779141068458557, "learning_rate": 2.7000000000000004e-06, "loss": -0.0, "num_tokens": 2058760.0, "reward": 0.512499988079071, "reward_std": 0.5629165172576904, "rewards/reward_func/mean": 0.512499988079071, "rewards/reward_func/std": 0.5215019583702087, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7157435975968838, "epoch": 0.857421875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.694736842105263e-06, "loss": 0.0, "num_tokens": 2063616.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.480139946565032, "epoch": 0.859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24946144223213196, "learning_rate": 2.6894736842105267e-06, "loss": 0.0, "num_tokens": 2068552.0, "reward": 0.2633500099182129, "reward_std": 0.47686171531677246, "rewards/reward_func/mean": 0.2633500099182129, "rewards/reward_func/std": 0.44164177775382996, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5838107727468014, "epoch": 0.861328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24031829833984375, "learning_rate": 2.68421052631579e-06, "loss": 0.0, "num_tokens": 2073352.0, "reward": 0.7457625269889832, "reward_std": 0.46000391244888306, "rewards/reward_func/mean": 0.7457624673843384, "rewards/reward_func/std": 0.4260120391845703, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4704406224191189, "epoch": 0.86328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23864911496639252, "learning_rate": 2.6789473684210526e-06, "loss": -0.0, "num_tokens": 2078136.0, "reward": 0.2421249896287918, "reward_std": 0.4759899377822876, "rewards/reward_func/mean": 0.2421249896287918, "rewards/reward_func/std": 0.44069764018058777, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49487001448869705, "epoch": 0.865234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.19437025487422943, "learning_rate": 2.6736842105263162e-06, "loss": 0.0, "num_tokens": 2082904.0, "reward": 0.2421249896287918, "reward_std": 0.27958187460899353, "rewards/reward_func/mean": 0.2421249896287918, "rewards/reward_func/std": 0.4406470060348511, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6033957190811634, "epoch": 0.8671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21896040439605713, "learning_rate": 2.668421052631579e-06, "loss": -0.0, "num_tokens": 2087560.0, "reward": 0.5024124979972839, "reward_std": 0.4808311462402344, "rewards/reward_func/mean": 0.5024124979972839, "rewards/reward_func/std": 0.5208636522293091, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5963526703417301, "epoch": 0.869140625, "frac_reward_zero_std": 0.5, "grad_norm": 0.17856237292289734, "learning_rate": 2.6631578947368426e-06, "loss": 0.0, "num_tokens": 2092416.0, "reward": 0.8560999631881714, "reward_std": 0.24459999799728394, "rewards/reward_func/mean": 0.8560999631881714, "rewards/reward_func/std": 0.34591665863990784, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49352680519223213, "epoch": 0.87109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24865403771400452, "learning_rate": 2.6578947368421053e-06, "loss": 0.0, "num_tokens": 2097096.0, "reward": 0.7447500228881836, "reward_std": 0.4966987371444702, "rewards/reward_func/mean": 0.7447500228881836, "rewards/reward_func/std": 0.4598980247974396, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6915704943239689, "epoch": 0.873046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2585187554359436, "learning_rate": 2.652631578947369e-06, "loss": 0.0, "num_tokens": 2102104.0, "reward": 0.2282000035047531, "reward_std": 0.4564000070095062, "rewards/reward_func/mean": 0.2282000035047531, "rewards/reward_func/std": 0.42270201444625854, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7159839831292629, "epoch": 0.875, "frac_reward_zero_std": 0.5, "grad_norm": 0.20453768968582153, "learning_rate": 2.6473684210526316e-06, "loss": 0.0, "num_tokens": 2107120.0, "reward": 0.11140000075101852, "reward_std": 0.22280000150203705, "rewards/reward_func/mean": 0.11140000075101852, "rewards/reward_func/std": 0.3150867819786072, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6321196630597115, "epoch": 0.876953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2514483332633972, "learning_rate": 2.6421052631578948e-06, "loss": -0.0, "num_tokens": 2111864.0, "reward": 0.7442624568939209, "reward_std": 0.47556716203689575, "rewards/reward_func/mean": 0.7442624568939209, "rewards/reward_func/std": 0.4406154155731201, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5473988149315119, "epoch": 0.87890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2295861393213272, "learning_rate": 2.6368421052631584e-06, "loss": 0.0, "num_tokens": 2116800.0, "reward": 0.48919999599456787, "reward_std": 0.48919999599456787, "rewards/reward_func/mean": 0.48919999599456787, "rewards/reward_func/std": 0.522976815700531, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6474330313503742, "epoch": 0.880859375, "frac_reward_zero_std": 0.5, "grad_norm": 0.13752923905849457, "learning_rate": 2.631578947368421e-06, "loss": 0.0, "num_tokens": 2121664.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47194433584809303, "epoch": 0.8828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22527579963207245, "learning_rate": 2.6263157894736847e-06, "loss": 0.0, "num_tokens": 2126328.0, "reward": 0.5039750337600708, "reward_std": 0.560730516910553, "rewards/reward_func/mean": 0.5039750337600708, "rewards/reward_func/std": 0.5192855596542358, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.696033101528883, "epoch": 0.884765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21090690791606903, "learning_rate": 2.6210526315789474e-06, "loss": -0.0, "num_tokens": 2130864.0, "reward": 0.7397750020027161, "reward_std": 0.28458285331726074, "rewards/reward_func/mean": 0.7397749423980713, "rewards/reward_func/std": 0.4491344392299652, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5411256179213524, "epoch": 0.88671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.18700988590717316, "learning_rate": 2.615789473684211e-06, "loss": 0.0, "num_tokens": 2135832.0, "reward": 0.3330000042915344, "reward_std": 0.47834351658821106, "rewards/reward_func/mean": 0.3330000042915344, "rewards/reward_func/std": 0.45958366990089417, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.602843264117837, "epoch": 0.888671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19545380771160126, "learning_rate": 2.6105263157894738e-06, "loss": -0.0, "num_tokens": 2140624.0, "reward": 0.24056249856948853, "reward_std": 0.47697657346725464, "rewards/reward_func/mean": 0.24056249856948853, "rewards/reward_func/std": 0.44159868359565735, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7841911315917969, "epoch": 0.890625, "frac_reward_zero_std": 0.5, "grad_norm": 0.16834405064582825, "learning_rate": 2.605263157894737e-06, "loss": 0.0, "num_tokens": 2145096.0, "reward": 0.7562500238418579, "reward_std": 0.2814582586288452, "rewards/reward_func/mean": 0.7562500238418579, "rewards/reward_func/std": 0.4513373076915741, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4610623624175787, "epoch": 0.892578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22504417598247528, "learning_rate": 2.6e-06, "loss": 0.0, "num_tokens": 2149848.0, "reward": 0.7124999761581421, "reward_std": 0.47541630268096924, "rewards/reward_func/mean": 0.7125000357627869, "rewards/reward_func/std": 0.44025152921676636, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5260163843631744, "epoch": 0.89453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2690121531486511, "learning_rate": 2.5947368421052633e-06, "loss": 0.0, "num_tokens": 2154696.0, "reward": 0.350600004196167, "reward_std": 0.4991450905799866, "rewards/reward_func/mean": 0.350600004196167, "rewards/reward_func/std": 0.48443493247032166, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8946198597550392, "epoch": 0.896484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.25450319051742554, "learning_rate": 2.589473684210527e-06, "loss": 0.0, "num_tokens": 2159176.0, "reward": 0.9945999979972839, "reward_std": 0.010800004005432129, "rewards/reward_func/mean": 0.9945999979972839, "rewards/reward_func/std": 0.009998860768973827, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3343417225405574, "epoch": 0.8984375, "frac_reward_zero_std": 0.5, "grad_norm": 0.23701933026313782, "learning_rate": 2.5842105263157896e-06, "loss": -0.0, "num_tokens": 2163664.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5373089350759983, "epoch": 0.900390625, "frac_reward_zero_std": 0.5, "grad_norm": 0.14844948053359985, "learning_rate": 2.578947368421053e-06, "loss": 0.0, "num_tokens": 2168448.0, "reward": 0.23899999260902405, "reward_std": 0.27597343921661377, "rewards/reward_func/mean": 0.23899999260902405, "rewards/reward_func/std": 0.44254201650619507, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6289132609963417, "epoch": 0.90234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2263883650302887, "learning_rate": 2.573684210526316e-06, "loss": 0.0, "num_tokens": 2173248.0, "reward": 0.6249375343322754, "reward_std": 0.5195988416671753, "rewards/reward_func/mean": 0.6249375343322754, "rewards/reward_func/std": 0.4944932758808136, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9023691639304161, "epoch": 0.904296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3016529679298401, "learning_rate": 2.568421052631579e-06, "loss": 0.0, "num_tokens": 2177720.0, "reward": 0.6285499930381775, "reward_std": 0.25130394101142883, "rewards/reward_func/mean": 0.6285499930381775, "rewards/reward_func/std": 0.5067988634109497, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5828330963850021, "epoch": 0.90625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20633725821971893, "learning_rate": 2.5631578947368422e-06, "loss": 0.0, "num_tokens": 2182560.0, "reward": 0.6124000549316406, "reward_std": 0.48071789741516113, "rewards/reward_func/mean": 0.6124000549316406, "rewards/reward_func/std": 0.46755528450012207, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.421830615028739, "epoch": 0.908203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.1946926712989807, "learning_rate": 2.5578947368421054e-06, "loss": -0.0, "num_tokens": 2187512.0, "reward": 0.24700000882148743, "reward_std": 0.2572190761566162, "rewards/reward_func/mean": 0.24700000882148743, "rewards/reward_func/std": 0.39629149436950684, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.38339319732040167, "epoch": 0.91015625, "frac_reward_zero_std": 0.5, "grad_norm": 0.11662346869707108, "learning_rate": 2.552631578947369e-06, "loss": 0.0, "num_tokens": 2192024.0, "reward": 0.8113000392913818, "reward_std": 0.23103395104408264, "rewards/reward_func/mean": 0.8113000392913818, "rewards/reward_func/std": 0.32786741852760315, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.733765222132206, "epoch": 0.912109375, "frac_reward_zero_std": 0.5, "grad_norm": 0.13307461142539978, "learning_rate": 2.5473684210526317e-06, "loss": -0.0, "num_tokens": 2197040.0, "reward": 0.11410000175237656, "reward_std": 0.2282000035047531, "rewards/reward_func/mean": 0.11410000175237656, "rewards/reward_func/std": 0.3227235674858093, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5777375996112823, "epoch": 0.9140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2451069951057434, "learning_rate": 2.5421052631578953e-06, "loss": 0.0, "num_tokens": 2201888.0, "reward": 0.58160001039505, "reward_std": 0.5065810680389404, "rewards/reward_func/mean": 0.58160001039505, "rewards/reward_func/std": 0.4829333424568176, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5028033722192049, "epoch": 0.916015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23731045424938202, "learning_rate": 2.536842105263158e-06, "loss": 0.0, "num_tokens": 2206744.0, "reward": 0.7462999820709229, "reward_std": 0.4641999900341034, "rewards/reward_func/mean": 0.7462999820709229, "rewards/reward_func/std": 0.4297657012939453, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49811795353889465, "epoch": 0.91796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.20878010988235474, "learning_rate": 2.5315789473684212e-06, "loss": 0.0, "num_tokens": 2211416.0, "reward": 0.7425000071525574, "reward_std": 0.30367511510849, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.4587482810020447, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.36679044365882874, "epoch": 0.919921875, "frac_reward_zero_std": 0.5, "grad_norm": 0.12484736740589142, "learning_rate": 2.5263157894736844e-06, "loss": -0.0, "num_tokens": 2215912.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5753449611365795, "epoch": 0.921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.23447397351264954, "learning_rate": 2.5210526315789475e-06, "loss": 0.0, "num_tokens": 2220568.0, "reward": 0.38792502880096436, "reward_std": 0.5182595252990723, "rewards/reward_func/mean": 0.38792499899864197, "rewards/reward_func/std": 0.5017743706703186, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4622494773939252, "epoch": 0.923828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.25002261996269226, "learning_rate": 2.5157894736842107e-06, "loss": 0.0, "num_tokens": 2225352.0, "reward": 0.249937504529953, "reward_std": 0.47128826379776, "rewards/reward_func/mean": 0.249937504529953, "rewards/reward_func/std": 0.4365290105342865, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5469172280281782, "epoch": 0.92578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20600056648254395, "learning_rate": 2.510526315789474e-06, "loss": 0.0, "num_tokens": 2230336.0, "reward": 0.22462499141693115, "reward_std": 0.44099122285842896, "rewards/reward_func/mean": 0.22462499141693115, "rewards/reward_func/std": 0.4093196392059326, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6319026313722134, "epoch": 0.927734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3944365382194519, "learning_rate": 2.5052631578947375e-06, "loss": 0.0, "num_tokens": 2235096.0, "reward": 0.5051125288009644, "reward_std": 0.4934867322444916, "rewards/reward_func/mean": 0.5051125288009644, "rewards/reward_func/std": 0.5237316489219666, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.34385241754353046, "epoch": 0.9296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.18634098768234253, "learning_rate": 2.5e-06, "loss": 0.0, "num_tokens": 2240048.0, "reward": 0.4440000057220459, "reward_std": 0.5151644945144653, "rewards/reward_func/mean": 0.4440000057220459, "rewards/reward_func/std": 0.47747913002967834, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5607694983482361, "epoch": 0.931640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19195789098739624, "learning_rate": 2.4947368421052634e-06, "loss": -0.0, "num_tokens": 2244704.0, "reward": 0.6195999979972839, "reward_std": 0.25623539090156555, "rewards/reward_func/mean": 0.6195999979972839, "rewards/reward_func/std": 0.513155460357666, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6188374534249306, "epoch": 0.93359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.19206063449382782, "learning_rate": 2.4894736842105265e-06, "loss": 0.0, "num_tokens": 2249352.0, "reward": 0.7438000440597534, "reward_std": 0.4960067868232727, "rewards/reward_func/mean": 0.7437999844551086, "rewards/reward_func/std": 0.45921406149864197, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7727699540555477, "epoch": 0.935546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.24975650012493134, "learning_rate": 2.4842105263157897e-06, "loss": -0.0, "num_tokens": 2254096.0, "reward": 0.7468999624252319, "reward_std": 0.46515488624572754, "rewards/reward_func/mean": 0.7468999624252319, "rewards/reward_func/std": 0.43068981170654297, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5285328812897205, "epoch": 0.9375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18033409118652344, "learning_rate": 2.478947368421053e-06, "loss": 0.0, "num_tokens": 2259064.0, "reward": 0.22200000286102295, "reward_std": 0.4440000057220459, "rewards/reward_func/mean": 0.22200000286102295, "rewards/reward_func/std": 0.41106414794921875, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6708570122718811, "epoch": 0.939453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23435814678668976, "learning_rate": 2.473684210526316e-06, "loss": 0.0, "num_tokens": 2263720.0, "reward": 0.3711249828338623, "reward_std": 0.5285789370536804, "rewards/reward_func/mean": 0.3711249828338623, "rewards/reward_func/std": 0.5054450631141663, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6528993677347898, "epoch": 0.94140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23905879259109497, "learning_rate": 2.468421052631579e-06, "loss": -0.0, "num_tokens": 2268472.0, "reward": 0.6135500073432922, "reward_std": 0.5113159418106079, "rewards/reward_func/mean": 0.6135500073432922, "rewards/reward_func/std": 0.4950350224971771, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47676665522158146, "epoch": 0.943359375, "frac_reward_zero_std": 0.5, "grad_norm": 0.145980104804039, "learning_rate": 2.4631578947368424e-06, "loss": 0.0, "num_tokens": 2273312.0, "reward": 0.8654749989509583, "reward_std": 0.22585000097751617, "rewards/reward_func/mean": 0.8654749989509583, "rewards/reward_func/std": 0.3194001615047455, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5406187996268272, "epoch": 0.9453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23957812786102295, "learning_rate": 2.4578947368421055e-06, "loss": 0.0, "num_tokens": 2278288.0, "reward": 0.2150000035762787, "reward_std": 0.4300000071525574, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.39838388562202454, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5646249614655972, "epoch": 0.947265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2331782728433609, "learning_rate": 2.4526315789473687e-06, "loss": 0.0, "num_tokens": 2283248.0, "reward": 0.11626250296831131, "reward_std": 0.2122942954301834, "rewards/reward_func/mean": 0.11626250296831131, "rewards/reward_func/std": 0.28399014472961426, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5729491971433163, "epoch": 0.94921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22905918955802917, "learning_rate": 2.447368421052632e-06, "loss": 0.0, "num_tokens": 2287904.0, "reward": 0.49886250495910645, "reward_std": 0.4972279667854309, "rewards/reward_func/mean": 0.49886250495910645, "rewards/reward_func/std": 0.5300286412239075, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6615194007754326, "epoch": 0.951171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.26248008012771606, "learning_rate": 2.442105263157895e-06, "loss": 0.0, "num_tokens": 2292752.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5113859847187996, "epoch": 0.953125, "frac_reward_zero_std": 0.5, "grad_norm": 0.14094099402427673, "learning_rate": 2.436842105263158e-06, "loss": -0.0, "num_tokens": 2297416.0, "reward": 0.754687488079071, "reward_std": 0.2833658754825592, "rewards/reward_func/mean": 0.754687488079071, "rewards/reward_func/std": 0.45434102416038513, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6214317940175533, "epoch": 0.955078125, "frac_reward_zero_std": 0.5, "grad_norm": 0.16864041984081268, "learning_rate": 2.4315789473684213e-06, "loss": 0.0, "num_tokens": 2301776.0, "reward": 0.6328125, "reward_std": 0.24523451924324036, "rewards/reward_func/mean": 0.6328125, "rewards/reward_func/std": 0.5071338415145874, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5048791384324431, "epoch": 0.95703125, "frac_reward_zero_std": 0.5, "grad_norm": 0.13176527619361877, "learning_rate": 2.4263157894736845e-06, "loss": -0.0, "num_tokens": 2306272.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8380111455917358, "epoch": 0.958984375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1738596111536026, "learning_rate": 2.4210526315789477e-06, "loss": -0.0, "num_tokens": 2310752.0, "reward": 0.754687488079071, "reward_std": 0.2833658754825592, "rewards/reward_func/mean": 0.754687488079071, "rewards/reward_func/std": 0.45434102416038513, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3626231402158737, "epoch": 0.9609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.1840285211801529, "learning_rate": 2.415789473684211e-06, "loss": -0.0, "num_tokens": 2315256.0, "reward": 0.6972000002861023, "reward_std": 0.46480000019073486, "rewards/reward_func/mean": 0.6972000002861023, "rewards/reward_func/std": 0.43032118678092957, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7078562341630459, "epoch": 0.962890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20519906282424927, "learning_rate": 2.410526315789474e-06, "loss": -0.0, "num_tokens": 2320008.0, "reward": 0.488349974155426, "reward_std": 0.4877285361289978, "rewards/reward_func/mean": 0.4883500039577484, "rewards/reward_func/std": 0.52220219373703, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5425986461341381, "epoch": 0.96484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2026311159133911, "learning_rate": 2.4052631578947367e-06, "loss": 0.0, "num_tokens": 2324960.0, "reward": 0.24543750286102295, "reward_std": 0.2619554400444031, "rewards/reward_func/mean": 0.24543750286102295, "rewards/reward_func/std": 0.3972027599811554, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.54656021296978, "epoch": 0.966796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2986944913864136, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "num_tokens": 2329616.0, "reward": 0.26292499899864197, "reward_std": 0.4851202964782715, "rewards/reward_func/mean": 0.26292499899864197, "rewards/reward_func/std": 0.4491825997829437, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8450591489672661, "epoch": 0.96875, "frac_reward_zero_std": 0.0, "grad_norm": 0.36952802538871765, "learning_rate": 2.3947368421052635e-06, "loss": 0.0, "num_tokens": 2334088.0, "reward": 0.6332374811172485, "reward_std": 0.5205224752426147, "rewards/reward_func/mean": 0.6332374811172485, "rewards/reward_func/std": 0.5004647970199585, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6656617224216461, "epoch": 0.970703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.19458124041557312, "learning_rate": 2.3894736842105266e-06, "loss": 0.0, "num_tokens": 2339056.0, "reward": 0.3577499985694885, "reward_std": 0.46434903144836426, "rewards/reward_func/mean": 0.3577499985694885, "rewards/reward_func/std": 0.4540364146232605, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6396152228116989, "epoch": 0.97265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.25116077065467834, "learning_rate": 2.38421052631579e-06, "loss": -0.0, "num_tokens": 2343704.0, "reward": 0.37542498111724854, "reward_std": 0.5320296287536621, "rewards/reward_func/mean": 0.3754250109195709, "rewards/reward_func/std": 0.5113484263420105, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4765377175062895, "epoch": 0.974609375, "frac_reward_zero_std": 0.5, "grad_norm": 0.15191896259784698, "learning_rate": 2.378947368421053e-06, "loss": 0.0, "num_tokens": 2348496.0, "reward": 0.23672500252723694, "reward_std": 0.26632454991340637, "rewards/reward_func/mean": 0.23672500252723694, "rewards/reward_func/std": 0.43085548281669617, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7641369961202145, "epoch": 0.9765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2522798776626587, "learning_rate": 2.373684210526316e-06, "loss": 0.0, "num_tokens": 2353504.0, "reward": 0.475600004196167, "reward_std": 0.5505199432373047, "rewards/reward_func/mean": 0.475600004196167, "rewards/reward_func/std": 0.5098142623901367, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5594486184418201, "epoch": 0.978515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24103155732154846, "learning_rate": 2.368421052631579e-06, "loss": -0.0, "num_tokens": 2358360.0, "reward": 0.8697500228881836, "reward_std": 0.2605000138282776, "rewards/reward_func/mean": 0.8697500228881836, "rewards/reward_func/std": 0.3517392575740814, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5576187632977962, "epoch": 0.98046875, "frac_reward_zero_std": 0.5, "grad_norm": 0.22993963956832886, "learning_rate": 2.363157894736842e-06, "loss": 0.0, "num_tokens": 2362984.0, "reward": 0.878125011920929, "reward_std": 0.24375002086162567, "rewards/reward_func/mean": 0.878125011920929, "rewards/reward_func/std": 0.34471458196640015, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7988773807883263, "epoch": 0.982421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.25204092264175415, "learning_rate": 2.357894736842105e-06, "loss": -0.0, "num_tokens": 2367472.0, "reward": 0.2640625238418579, "reward_std": 0.49094337224960327, "rewards/reward_func/mean": 0.2640624940395355, "rewards/reward_func/std": 0.45456206798553467, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5971827898174524, "epoch": 0.984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21355661749839783, "learning_rate": 2.352631578947369e-06, "loss": 0.0, "num_tokens": 2372120.0, "reward": 0.7473000288009644, "reward_std": 0.4982522130012512, "rewards/reward_func/mean": 0.7473000288009644, "rewards/reward_func/std": 0.4613037705421448, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5676711704581976, "epoch": 0.986328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.37902572751045227, "learning_rate": 2.347368421052632e-06, "loss": 0.0, "num_tokens": 2377088.0, "reward": 0.3488750159740448, "reward_std": 0.4752463102340698, "rewards/reward_func/mean": 0.3488750159740448, "rewards/reward_func/std": 0.4613319933414459, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7709695436060429, "epoch": 0.98828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24499008059501648, "learning_rate": 2.342105263157895e-06, "loss": 0.0, "num_tokens": 2381888.0, "reward": 0.7379875183105469, "reward_std": 0.4632129669189453, "rewards/reward_func/mean": 0.7379875183105469, "rewards/reward_func/std": 0.42976444959640503, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4376491168513894, "epoch": 0.990234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.22884558141231537, "learning_rate": 2.3368421052631583e-06, "loss": 0.0, "num_tokens": 2386376.0, "reward": 0.46480000019073486, "reward_std": 0.5367048382759094, "rewards/reward_func/mean": 0.46480000019073486, "rewards/reward_func/std": 0.49689212441444397, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8945914134383202, "epoch": 0.9921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.24041084945201874, "learning_rate": 2.331578947368421e-06, "loss": 0.0, "num_tokens": 2390872.0, "reward": 0.5062500238418579, "reward_std": 0.49611565470695496, "rewards/reward_func/mean": 0.5062500238418579, "rewards/reward_func/std": 0.5280946493148804, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8071067947894335, "epoch": 0.994140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.25361520051956177, "learning_rate": 2.326315789473684e-06, "loss": 0.0, "num_tokens": 2395344.0, "reward": 0.507099986076355, "reward_std": 0.5571578741073608, "rewards/reward_func/mean": 0.507099986076355, "rewards/reward_func/std": 0.5159706473350525, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.576714625582099, "epoch": 0.99609375, "frac_reward_zero_std": 0.5, "grad_norm": 0.13069601356983185, "learning_rate": 2.3210526315789473e-06, "loss": 0.0, "num_tokens": 2400304.0, "reward": 0.11100000143051147, "reward_std": 0.22200000286102295, "rewards/reward_func/mean": 0.11100000143051147, "rewards/reward_func/std": 0.3139553964138031, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5591612830758095, "epoch": 0.998046875, "frac_reward_zero_std": 0.5, "grad_norm": 0.16368049383163452, "learning_rate": 2.3157894736842105e-06, "loss": 0.0, "num_tokens": 2405088.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/reward_func/mean": 0.0031250000465661287, "rewards/reward_func/std": 0.008838835172355175, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7550145424902439, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.2605975568294525, "learning_rate": 2.310526315789474e-06, "loss": 0.0, "num_tokens": 2409848.0, "reward": 0.36937499046325684, "reward_std": 0.5097146034240723, "rewards/reward_func/mean": 0.36937499046325684, "rewards/reward_func/std": 0.49009066820144653, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6688739284873009, "epoch": 1.001953125, "frac_reward_zero_std": 0.5, "grad_norm": 0.16578584909439087, "learning_rate": 2.3052631578947373e-06, "loss": 0.0, "num_tokens": 2414288.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7435273714363575, "epoch": 1.00390625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.3000000000000004e-06, "loss": 0.0, "num_tokens": 2418728.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6509594097733498, "epoch": 1.005859375, "frac_reward_zero_std": 0.5, "grad_norm": 0.16100262105464935, "learning_rate": 2.294736842105263e-06, "loss": -0.0, "num_tokens": 2423488.0, "reward": 0.8224999904632568, "reward_std": 0.23499999940395355, "rewards/reward_func/mean": 0.8224999904632568, "rewards/reward_func/std": 0.33234018087387085, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6421031467616558, "epoch": 1.0078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20752696692943573, "learning_rate": 2.2894736842105263e-06, "loss": 0.0, "num_tokens": 2428328.0, "reward": 0.3933500051498413, "reward_std": 0.24158133566379547, "rewards/reward_func/mean": 0.3933500051498413, "rewards/reward_func/std": 0.4744868576526642, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.2608198570087552, "epoch": 1.009765625, "frac_reward_zero_std": 0.5, "grad_norm": 0.09608624875545502, "learning_rate": 2.2842105263157895e-06, "loss": 0.0, "num_tokens": 2432816.0, "reward": 0.6972000002861023, "reward_std": 0.2683524191379547, "rewards/reward_func/mean": 0.6972000002861023, "rewards/reward_func/std": 0.43032118678092957, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7466496359556913, "epoch": 1.01171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2451416254043579, "learning_rate": 2.2789473684210527e-06, "loss": -0.0, "num_tokens": 2437616.0, "reward": 0.8708249926567078, "reward_std": 0.2384064942598343, "rewards/reward_func/mean": 0.8708249926567078, "rewards/reward_func/std": 0.3219219446182251, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6013193130493164, "epoch": 1.013671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.20224137604236603, "learning_rate": 2.273684210526316e-06, "loss": -0.0, "num_tokens": 2442272.0, "reward": 0.36959999799728394, "reward_std": 0.5324397087097168, "rewards/reward_func/mean": 0.36959999799728394, "rewards/reward_func/std": 0.5101400017738342, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7230842337012291, "epoch": 1.015625, "frac_reward_zero_std": 0.5, "grad_norm": 0.16407498717308044, "learning_rate": 2.268421052631579e-06, "loss": 0.0, "num_tokens": 2447272.0, "reward": 0.11680000275373459, "reward_std": 0.23360000550746918, "rewards/reward_func/mean": 0.11680000275373459, "rewards/reward_func/std": 0.3303602933883667, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.39533343352377415, "epoch": 1.017578125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.2631578947368426e-06, "loss": 0.0, "num_tokens": 2451776.0, "reward": 0.9296000003814697, "reward_std": 0.0, "rewards/reward_func/mean": 0.9296000003814697, "rewards/reward_func/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6934016719460487, "epoch": 1.01953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20798802375793457, "learning_rate": 2.2578947368421057e-06, "loss": 0.0, "num_tokens": 2456392.0, "reward": 0.7335000038146973, "reward_std": 0.48899999260902405, "rewards/reward_func/mean": 0.7335000038146973, "rewards/reward_func/std": 0.4527260363101959, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49564963206648827, "epoch": 1.021484375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1638687700033188, "learning_rate": 2.2526315789473685e-06, "loss": 0.0, "num_tokens": 2461176.0, "reward": 0.23899999260902405, "reward_std": 0.27597343921661377, "rewards/reward_func/mean": 0.23899999260902405, "rewards/reward_func/std": 0.44254201650619507, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4542197324335575, "epoch": 1.0234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18925493955612183, "learning_rate": 2.2473684210526316e-06, "loss": 0.0, "num_tokens": 2465840.0, "reward": 0.7464874982833862, "reward_std": 0.2938106656074524, "rewards/reward_func/mean": 0.7464874982833862, "rewards/reward_func/std": 0.449546217918396, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7139078304171562, "epoch": 1.025390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23500654101371765, "learning_rate": 2.242105263157895e-06, "loss": 0.0, "num_tokens": 2470688.0, "reward": 0.3981500267982483, "reward_std": 0.49756962060928345, "rewards/reward_func/mean": 0.3981499969959259, "rewards/reward_func/std": 0.4812353849411011, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8366854265332222, "epoch": 1.02734375, "frac_reward_zero_std": 0.5, "grad_norm": 0.18551307916641235, "learning_rate": 2.236842105263158e-06, "loss": 0.0, "num_tokens": 2475448.0, "reward": 0.7081249952316284, "reward_std": 0.26779481768608093, "rewards/reward_func/mean": 0.7081249952316284, "rewards/reward_func/std": 0.42940106987953186, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6392475813627243, "epoch": 1.029296875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.231578947368421e-06, "loss": 0.0, "num_tokens": 2479800.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7336869612336159, "epoch": 1.03125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2812504172325134, "learning_rate": 2.2263157894736843e-06, "loss": 0.0, "num_tokens": 2484304.0, "reward": 0.6352249979972839, "reward_std": 0.5144467949867249, "rewards/reward_func/mean": 0.6352249979972839, "rewards/reward_func/std": 0.4920179843902588, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6500775553286076, "epoch": 1.033203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2616865932941437, "learning_rate": 2.221052631578948e-06, "loss": -0.0, "num_tokens": 2489104.0, "reward": 0.2381249964237213, "reward_std": 0.27268749475479126, "rewards/reward_func/mean": 0.2381249964237213, "rewards/reward_func/std": 0.43324100971221924, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6904492862522602, "epoch": 1.03515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20736540853977203, "learning_rate": 2.2157894736842106e-06, "loss": 0.0, "num_tokens": 2494112.0, "reward": 0.350600004196167, "reward_std": 0.5038028955459595, "rewards/reward_func/mean": 0.350600004196167, "rewards/reward_func/std": 0.48401686549186707, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8729193657636642, "epoch": 1.037109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2569698393344879, "learning_rate": 2.2105263157894738e-06, "loss": 0.0, "num_tokens": 2498864.0, "reward": 0.4913625121116638, "reward_std": 0.5638903379440308, "rewards/reward_func/mean": 0.4913625121116638, "rewards/reward_func/std": 0.5222924947738647, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9219260476529598, "epoch": 1.0390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.22864028811454773, "learning_rate": 2.205263157894737e-06, "loss": -0.0, "num_tokens": 2503656.0, "reward": 0.7295999526977539, "reward_std": 0.300462931394577, "rewards/reward_func/mean": 0.7296000123023987, "rewards/reward_func/std": 0.4509044587612152, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 1.0916531383991241, "epoch": 1.041015625, "frac_reward_zero_std": 0.5, "grad_norm": 0.2035861611366272, "learning_rate": 2.2e-06, "loss": 0.0, "num_tokens": 2508128.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47885562665760517, "epoch": 1.04296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19585774838924408, "learning_rate": 2.1947368421052633e-06, "loss": 0.0, "num_tokens": 2513104.0, "reward": 0.5544999837875366, "reward_std": 0.4817962646484375, "rewards/reward_func/mean": 0.5544999837875366, "rewards/reward_func/std": 0.4600767195224762, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.2929891608655453, "epoch": 1.044921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.18502745032310486, "learning_rate": 2.1894736842105264e-06, "loss": -0.0, "num_tokens": 2517608.0, "reward": 0.7003250122070312, "reward_std": 0.45855000615119934, "rewards/reward_func/mean": 0.7003250122070312, "rewards/reward_func/std": 0.4245873987674713, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5622880086302757, "epoch": 1.046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19996067881584167, "learning_rate": 2.1842105263157896e-06, "loss": 0.0, "num_tokens": 2522272.0, "reward": 0.49806249141693115, "reward_std": 0.5715733170509338, "rewards/reward_func/mean": 0.49806249141693115, "rewards/reward_func/std": 0.5292056798934937, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.43428971618413925, "epoch": 1.048828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.1537165641784668, "learning_rate": 2.1789473684210528e-06, "loss": -0.0, "num_tokens": 2526936.0, "reward": 0.12306249886751175, "reward_std": 0.2461249977350235, "rewards/reward_func/mean": 0.12306249886751175, "rewards/reward_func/std": 0.34305045008659363, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7195999212563038, "epoch": 1.05078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2509689927101135, "learning_rate": 2.173684210526316e-06, "loss": 0.0, "num_tokens": 2531464.0, "reward": 0.6269875168800354, "reward_std": 0.5275543928146362, "rewards/reward_func/mean": 0.6269874572753906, "rewards/reward_func/std": 0.5090279579162598, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7064641937613487, "epoch": 1.052734375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.168421052631579e-06, "loss": 0.0, "num_tokens": 2536328.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6143765226006508, "epoch": 1.0546875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1631578947368423e-06, "loss": 0.0, "num_tokens": 2541080.0, "reward": 0.9399999976158142, "reward_std": 0.0, "rewards/reward_func/mean": 0.9399999976158142, "rewards/reward_func/std": 0.0, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4053959520533681, "epoch": 1.056640625, "frac_reward_zero_std": 0.5, "grad_norm": 0.11423177272081375, "learning_rate": 2.1578947368421054e-06, "loss": -0.0, "num_tokens": 2545576.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8161136545240879, "epoch": 1.05859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2826439142227173, "learning_rate": 2.1526315789473686e-06, "loss": 0.0, "num_tokens": 2550104.0, "reward": 0.7566750049591064, "reward_std": 0.479504257440567, "rewards/reward_func/mean": 0.7566750049591064, "rewards/reward_func/std": 0.4439469873905182, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7691904827952385, "epoch": 1.060546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.24437452852725983, "learning_rate": 2.1473684210526317e-06, "loss": -0.0, "num_tokens": 2554888.0, "reward": 0.8523874878883362, "reward_std": 0.23936279118061066, "rewards/reward_func/mean": 0.8523874878883362, "rewards/reward_func/std": 0.32976391911506653, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.33484411239624023, "epoch": 1.0625, "frac_reward_zero_std": 0.5, "grad_norm": 0.12194463610649109, "learning_rate": 2.142105263157895e-06, "loss": -0.0, "num_tokens": 2559392.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5565262641757727, "epoch": 1.064453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20515061914920807, "learning_rate": 2.136842105263158e-06, "loss": -0.0, "num_tokens": 2564152.0, "reward": 0.3663124740123749, "reward_std": 0.5069268345832825, "rewards/reward_func/mean": 0.36631250381469727, "rewards/reward_func/std": 0.4885818362236023, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3899686820805073, "epoch": 1.06640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19220681488513947, "learning_rate": 2.1315789473684212e-06, "loss": -0.0, "num_tokens": 2569088.0, "reward": 0.117249995470047, "reward_std": 0.2220594733953476, "rewards/reward_func/mean": 0.117249995470047, "rewards/reward_func/std": 0.3115631341934204, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5984954424202442, "epoch": 1.068359375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1464897096157074, "learning_rate": 2.1263157894736844e-06, "loss": 0.0, "num_tokens": 2574056.0, "reward": 0.00937500037252903, "reward_std": 0.010825317353010178, "rewards/reward_func/mean": 0.00937500037252903, "rewards/reward_func/std": 0.01735912822186947, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6083435006439686, "epoch": 1.0703125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.1210526315789476e-06, "loss": 0.0, "num_tokens": 2578800.0, "reward": 0.9399999976158142, "reward_std": 0.0, "rewards/reward_func/mean": 0.9399999976158142, "rewards/reward_func/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5468325912952423, "epoch": 1.072265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21403901278972626, "learning_rate": 2.1157894736842107e-06, "loss": 0.0, "num_tokens": 2583544.0, "reward": 0.5962874889373779, "reward_std": 0.5044821500778198, "rewards/reward_func/mean": 0.5962875485420227, "rewards/reward_func/std": 0.4841693639755249, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6861546561121941, "epoch": 1.07421875, "frac_reward_zero_std": 0.5, "grad_norm": 0.18041720986366272, "learning_rate": 2.110526315789474e-06, "loss": 0.0, "num_tokens": 2587976.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7639718391001225, "epoch": 1.076171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.23512768745422363, "learning_rate": 2.105263157894737e-06, "loss": 0.0, "num_tokens": 2592728.0, "reward": 0.7156250476837158, "reward_std": 0.2827948331832886, "rewards/reward_func/mean": 0.715624988079071, "rewards/reward_func/std": 0.4345230460166931, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4462539367377758, "epoch": 1.078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.1933043748140335, "learning_rate": 2.1000000000000002e-06, "loss": -0.0, "num_tokens": 2597528.0, "reward": 0.2436874806880951, "reward_std": 0.47496649622917175, "rewards/reward_func/mean": 0.2436874955892563, "rewards/reward_func/std": 0.4397376477718353, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6553896479308605, "epoch": 1.080078125, "frac_reward_zero_std": 0.5, "grad_norm": 0.16326484084129333, "learning_rate": 2.0947368421052634e-06, "loss": 0.0, "num_tokens": 2602032.0, "reward": 0.8695999979972839, "reward_std": 0.24645258486270905, "rewards/reward_func/mean": 0.8695999979972839, "rewards/reward_func/std": 0.35150691866874695, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5182516658678651, "epoch": 1.08203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.18343044817447662, "learning_rate": 2.0894736842105266e-06, "loss": 0.0, "num_tokens": 2606832.0, "reward": 0.11836250126361847, "reward_std": 0.23672500252723694, "rewards/reward_func/mean": 0.11836250126361847, "rewards/reward_func/std": 0.3297579884529114, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7340024933218956, "epoch": 1.083984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24913115799427032, "learning_rate": 2.0842105263157897e-06, "loss": -0.0, "num_tokens": 2611360.0, "reward": 0.9920499920845032, "reward_std": 0.015900004655122757, "rewards/reward_func/mean": 0.9920499920845032, "rewards/reward_func/std": 0.015697769820690155, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6129337772727013, "epoch": 1.0859375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.078947368421053e-06, "loss": 0.0, "num_tokens": 2615704.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6931223534047604, "epoch": 1.087890625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 2.073684210526316e-06, "loss": 0.0, "num_tokens": 2620352.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7478494979441166, "epoch": 1.08984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24718572199344635, "learning_rate": 2.068421052631579e-06, "loss": 0.0, "num_tokens": 2625104.0, "reward": 0.9848999977111816, "reward_std": 0.013399453833699226, "rewards/reward_func/mean": 0.9848999977111816, "rewards/reward_func/std": 0.012781685218214989, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9691029265522957, "epoch": 1.091796875, "frac_reward_zero_std": 0.5, "grad_norm": 0.18628694117069244, "learning_rate": 2.0631578947368424e-06, "loss": -0.0, "num_tokens": 2629592.0, "reward": 0.754687488079071, "reward_std": 0.2833658754825592, "rewards/reward_func/mean": 0.754687488079071, "rewards/reward_func/std": 0.45434102416038513, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5964610986411572, "epoch": 1.09375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20570731163024902, "learning_rate": 2.0578947368421055e-06, "loss": -0.0, "num_tokens": 2634240.0, "reward": 0.24886250495910645, "reward_std": 0.28871649503707886, "rewards/reward_func/mean": 0.24886250495910645, "rewards/reward_func/std": 0.45700305700302124, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6072207950055599, "epoch": 1.095703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2261851280927658, "learning_rate": 2.0526315789473687e-06, "loss": 0.0, "num_tokens": 2638904.0, "reward": 0.6188000440597534, "reward_std": 0.5333460569381714, "rewards/reward_func/mean": 0.6187999844551086, "rewards/reward_func/std": 0.5125207901000977, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6318282932043076, "epoch": 1.09765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21229666471481323, "learning_rate": 2.047368421052632e-06, "loss": -0.0, "num_tokens": 2643640.0, "reward": 0.5874999761581421, "reward_std": 0.5063546299934387, "rewards/reward_func/mean": 0.5874999761581421, "rewards/reward_func/std": 0.486496239900589, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6489912085235119, "epoch": 1.099609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.25839194655418396, "learning_rate": 2.042105263157895e-06, "loss": 0.0, "num_tokens": 2648504.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8778778314590454, "epoch": 1.1015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.25570207834243774, "learning_rate": 2.036842105263158e-06, "loss": 0.0, "num_tokens": 2653248.0, "reward": 0.5106500387191772, "reward_std": 0.5463833808898926, "rewards/reward_func/mean": 0.5106500387191772, "rewards/reward_func/std": 0.5066616535186768, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6692373007535934, "epoch": 1.103515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20203210413455963, "learning_rate": 2.031578947368421e-06, "loss": -0.0, "num_tokens": 2658240.0, "reward": 0.23606249690055847, "reward_std": 0.4352036714553833, "rewards/reward_func/mean": 0.23606249690055847, "rewards/reward_func/std": 0.402925044298172, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.48891974054276943, "epoch": 1.10546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2103108912706375, "learning_rate": 2.026315789473684e-06, "loss": 0.0, "num_tokens": 2663024.0, "reward": 0.24681249260902405, "reward_std": 0.2817399799823761, "rewards/reward_func/mean": 0.24681249260902405, "rewards/reward_func/std": 0.43789422512054443, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.22187136486172676, "epoch": 1.107421875, "frac_reward_zero_std": 0.5, "grad_norm": 0.11180789768695831, "learning_rate": 2.0210526315789477e-06, "loss": 0.0, "num_tokens": 2667528.0, "reward": 0.6972000002861023, "reward_std": 0.2683524191379547, "rewards/reward_func/mean": 0.6972000002861023, "rewards/reward_func/std": 0.43032118678092957, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6616909764707088, "epoch": 1.109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.22444318234920502, "learning_rate": 2.015789473684211e-06, "loss": 0.0, "num_tokens": 2672320.0, "reward": 0.736412525177002, "reward_std": 0.4869075417518616, "rewards/reward_func/mean": 0.736412525177002, "rewards/reward_func/std": 0.4509141147136688, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5226791556924582, "epoch": 1.111328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.25233376026153564, "learning_rate": 2.010526315789474e-06, "loss": -0.0, "num_tokens": 2677112.0, "reward": 0.3647499680519104, "reward_std": 0.5094026923179626, "rewards/reward_func/mean": 0.3647499680519104, "rewards/reward_func/std": 0.4897109270095825, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7138075400143862, "epoch": 1.11328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22373870015144348, "learning_rate": 2.005263157894737e-06, "loss": 0.0, "num_tokens": 2681760.0, "reward": 0.49729999899864197, "reward_std": 0.5742666125297546, "rewards/reward_func/mean": 0.49729999899864197, "rewards/reward_func/std": 0.531683087348938, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49137804098427296, "epoch": 1.115234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.1849352866411209, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "num_tokens": 2686720.0, "reward": 0.015625, "reward_std": 0.025966878980398178, "rewards/reward_func/mean": 0.015625, "rewards/reward_func/std": 0.0265165064483881, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 1.0584421530365944, "epoch": 1.1171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.32097047567367554, "learning_rate": 1.994736842105263e-06, "loss": 0.0, "num_tokens": 2691216.0, "reward": 0.6211625337600708, "reward_std": 0.5293147563934326, "rewards/reward_func/mean": 0.6211625337600708, "rewards/reward_func/std": 0.511013925075531, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5815026629716158, "epoch": 1.119140625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1526409089565277, "learning_rate": 1.9894736842105262e-06, "loss": 0.0, "num_tokens": 2695880.0, "reward": 0.8765624761581421, "reward_std": 0.24687500298023224, "rewards/reward_func/mean": 0.8765624761581421, "rewards/reward_func/std": 0.34913399815559387, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.731554638594389, "epoch": 1.12109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.23003970086574554, "learning_rate": 1.9842105263157894e-06, "loss": 0.0, "num_tokens": 2700208.0, "reward": 0.754687488079071, "reward_std": 0.4906249940395355, "rewards/reward_func/mean": 0.754687488079071, "rewards/reward_func/std": 0.45434102416038513, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.44338976964354515, "epoch": 1.123046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.1868741661310196, "learning_rate": 1.978947368421053e-06, "loss": -0.0, "num_tokens": 2704992.0, "reward": 0.3651750087738037, "reward_std": 0.5036294460296631, "rewards/reward_func/mean": 0.3651749789714813, "rewards/reward_func/std": 0.4835827052593231, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5494178477674723, "epoch": 1.125, "frac_reward_zero_std": 0.5, "grad_norm": 0.15477776527404785, "learning_rate": 1.973684210526316e-06, "loss": 0.0, "num_tokens": 2709976.0, "reward": 0.3361250162124634, "reward_std": 0.21575000882148743, "rewards/reward_func/mean": 0.3361250162124634, "rewards/reward_func/std": 0.4570740759372711, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.35547622572630644, "epoch": 1.126953125, "frac_reward_zero_std": 0.5, "grad_norm": 0.10672120004892349, "learning_rate": 1.9684210526315793e-06, "loss": 0.0, "num_tokens": 2714472.0, "reward": 0.9275000095367432, "reward_std": 0.0041999914683401585, "rewards/reward_func/mean": 0.9275000095367432, "rewards/reward_func/std": 0.005939692724496126, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6676280610263348, "epoch": 1.12890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21440531313419342, "learning_rate": 1.9631578947368425e-06, "loss": 0.0, "num_tokens": 2719128.0, "reward": 0.3832375109195709, "reward_std": 0.5218474268913269, "rewards/reward_func/mean": 0.3832375109195709, "rewards/reward_func/std": 0.5054107308387756, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5016189366579056, "epoch": 1.130859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20237448811531067, "learning_rate": 1.9578947368421052e-06, "loss": 0.0, "num_tokens": 2723792.0, "reward": 0.6060000061988831, "reward_std": 0.2554674744606018, "rewards/reward_func/mean": 0.6060000061988831, "rewards/reward_func/std": 0.502031147480011, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5959945041686296, "epoch": 1.1328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.33901241421699524, "learning_rate": 1.9526315789473684e-06, "loss": 0.0, "num_tokens": 2728560.0, "reward": 0.3585500121116638, "reward_std": 0.5105670690536499, "rewards/reward_func/mean": 0.3585500121116638, "rewards/reward_func/std": 0.4953995645046234, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.948068805038929, "epoch": 1.134765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.3101772665977478, "learning_rate": 1.9473684210526315e-06, "loss": -0.0, "num_tokens": 2733064.0, "reward": 0.6379250288009644, "reward_std": 0.512575626373291, "rewards/reward_func/mean": 0.6379250288009644, "rewards/reward_func/std": 0.494179368019104, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.46106173656880856, "epoch": 1.13671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.18642479181289673, "learning_rate": 1.9421052631578947e-06, "loss": -0.0, "num_tokens": 2737984.0, "reward": 0.8615000247955322, "reward_std": 0.2508353888988495, "rewards/reward_func/mean": 0.8615000247955322, "rewards/reward_func/std": 0.3482353091239929, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.27936943667009473, "epoch": 1.138671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.1574077308177948, "learning_rate": 1.936842105263158e-06, "loss": 0.0, "num_tokens": 2742496.0, "reward": 0.6930000185966492, "reward_std": 0.4620679020881653, "rewards/reward_func/mean": 0.6929999589920044, "rewards/reward_func/std": 0.4277917444705963, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9888619035482407, "epoch": 1.140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.266496479511261, "learning_rate": 1.9315789473684215e-06, "loss": 0.0, "num_tokens": 2747016.0, "reward": 0.864799976348877, "reward_std": 0.2641477882862091, "rewards/reward_func/mean": 0.864799976348877, "rewards/reward_func/std": 0.35006821155548096, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7337736170738935, "epoch": 1.142578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23653343319892883, "learning_rate": 1.9263157894736846e-06, "loss": -0.0, "num_tokens": 2752000.0, "reward": 0.23918750882148743, "reward_std": 0.43328288197517395, "rewards/reward_func/mean": 0.23918750882148743, "rewards/reward_func/std": 0.4011473059654236, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5843038037419319, "epoch": 1.14453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21472285687923431, "learning_rate": 1.9210526315789474e-06, "loss": 0.0, "num_tokens": 2756976.0, "reward": 0.3330000042915344, "reward_std": 0.47834351658821106, "rewards/reward_func/mean": 0.3330000042915344, "rewards/reward_func/std": 0.45958366990089417, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.766981165856123, "epoch": 1.146484375, "frac_reward_zero_std": 0.5, "grad_norm": 0.171160027384758, "learning_rate": 1.9157894736842105e-06, "loss": -0.0, "num_tokens": 2761464.0, "reward": 0.7562500238418579, "reward_std": 0.28164324164390564, "rewards/reward_func/mean": 0.7562500238418579, "rewards/reward_func/std": 0.45153507590293884, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5991237238049507, "epoch": 1.1484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.23209227621555328, "learning_rate": 1.9105263157894737e-06, "loss": -0.0, "num_tokens": 2766240.0, "reward": 0.2514999806880951, "reward_std": 0.4700762629508972, "rewards/reward_func/mean": 0.2515000104904175, "rewards/reward_func/std": 0.4352715015411377, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4802367826923728, "epoch": 1.150390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20853254199028015, "learning_rate": 1.905263157894737e-06, "loss": -0.0, "num_tokens": 2771008.0, "reward": 0.26012498140335083, "reward_std": 0.47928479313850403, "rewards/reward_func/mean": 0.26012498140335083, "rewards/reward_func/std": 0.44427764415740967, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4567495686933398, "epoch": 1.15234375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1308472901582718, "learning_rate": 1.9000000000000002e-06, "loss": -0.0, "num_tokens": 2775512.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7382534444332123, "epoch": 1.154296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.24290543794631958, "learning_rate": 1.8947368421052634e-06, "loss": 0.0, "num_tokens": 2780360.0, "reward": 0.23649999499320984, "reward_std": 0.4729999899864197, "rewards/reward_func/mean": 0.23649999499320984, "rewards/reward_func/std": 0.4379509389400482, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8535390049219131, "epoch": 1.15625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2762669026851654, "learning_rate": 1.8894736842105266e-06, "loss": 0.0, "num_tokens": 2784888.0, "reward": 0.7473000288009644, "reward_std": 0.4982522130012512, "rewards/reward_func/mean": 0.7473000288009644, "rewards/reward_func/std": 0.4613037705421448, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7388596646487713, "epoch": 1.158203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2849021852016449, "learning_rate": 1.8842105263157895e-06, "loss": 0.0, "num_tokens": 2789632.0, "reward": 0.7318500280380249, "reward_std": 0.48854929208755493, "rewards/reward_func/mean": 0.7318500280380249, "rewards/reward_func/std": 0.4523949921131134, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5478004105389118, "epoch": 1.16015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21340344846248627, "learning_rate": 1.8789473684210527e-06, "loss": -0.0, "num_tokens": 2794568.0, "reward": 0.17698749899864197, "reward_std": 0.23321816325187683, "rewards/reward_func/mean": 0.17698749899864197, "rewards/reward_func/std": 0.3240951597690582, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7563757076859474, "epoch": 1.162109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.22627751529216766, "learning_rate": 1.8736842105263158e-06, "loss": 0.0, "num_tokens": 2799584.0, "reward": 0.24108749628067017, "reward_std": 0.46984440088272095, "rewards/reward_func/mean": 0.24108749628067017, "rewards/reward_func/std": 0.43506819009780884, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8104111216962337, "epoch": 1.1640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.258118599653244, "learning_rate": 1.868421052631579e-06, "loss": 0.0, "num_tokens": 2804328.0, "reward": 0.8609625101089478, "reward_std": 0.2564750015735626, "rewards/reward_func/mean": 0.8609625101089478, "rewards/reward_func/std": 0.3434082269668579, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.36982312239706516, "epoch": 1.166015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19538865983486176, "learning_rate": 1.8631578947368424e-06, "loss": -0.0, "num_tokens": 2808832.0, "reward": 0.6972000002861023, "reward_std": 0.46480000019073486, "rewards/reward_func/mean": 0.6972000002861023, "rewards/reward_func/std": 0.43032118678092957, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6775441057980061, "epoch": 1.16796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2285778522491455, "learning_rate": 1.8578947368421055e-06, "loss": 0.0, "num_tokens": 2813488.0, "reward": 0.4920499920845032, "reward_std": 0.5683344602584839, "rewards/reward_func/mean": 0.4920499920845032, "rewards/reward_func/std": 0.5261891484260559, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5741296671330929, "epoch": 1.169921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.20829658210277557, "learning_rate": 1.8526315789473687e-06, "loss": -0.0, "num_tokens": 2818272.0, "reward": 0.12574999034404755, "reward_std": 0.24323992431163788, "rewards/reward_func/mean": 0.12574999034404755, "rewards/reward_func/std": 0.33566170930862427, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.2949846936389804, "epoch": 1.171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.1604420691728592, "learning_rate": 1.8473684210526319e-06, "loss": -0.0, "num_tokens": 2822784.0, "reward": 0.6972000002861023, "reward_std": 0.46480000019073486, "rewards/reward_func/mean": 0.6972000002861023, "rewards/reward_func/std": 0.43032118678092957, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.843192458152771, "epoch": 1.173828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2168247401714325, "learning_rate": 1.8421052631578948e-06, "loss": 0.0, "num_tokens": 2827528.0, "reward": 0.7124999761581421, "reward_std": 0.47541630268096924, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.44025155901908875, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7822651527822018, "epoch": 1.17578125, "frac_reward_zero_std": 0.5, "grad_norm": 0.18856878578662872, "learning_rate": 1.836842105263158e-06, "loss": 0.0, "num_tokens": 2831880.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6440773196518421, "epoch": 1.177734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.23340384662151337, "learning_rate": 1.8315789473684211e-06, "loss": -0.0, "num_tokens": 2836528.0, "reward": 0.7204999923706055, "reward_std": 0.481002539396286, "rewards/reward_func/mean": 0.7204999923706055, "rewards/reward_func/std": 0.4453295171260834, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.576304204761982, "epoch": 1.1796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.23746424913406372, "learning_rate": 1.8263157894736843e-06, "loss": -0.0, "num_tokens": 2841192.0, "reward": 0.24729999899864197, "reward_std": 0.49459999799728394, "rewards/reward_func/mean": 0.24729999899864197, "rewards/reward_func/std": 0.4579470157623291, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8262333199381828, "epoch": 1.181640625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1838967502117157, "learning_rate": 1.8210526315789475e-06, "loss": 0.0, "num_tokens": 2846184.0, "reward": 0.11140000075101852, "reward_std": 0.22280000150203705, "rewards/reward_func/mean": 0.11140000075101852, "rewards/reward_func/std": 0.3150867819786072, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7038808278739452, "epoch": 1.18359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.22406964004039764, "learning_rate": 1.8157894736842109e-06, "loss": 0.0, "num_tokens": 2850784.0, "reward": 0.7308000326156616, "reward_std": 0.4872533679008484, "rewards/reward_func/mean": 0.7307999730110168, "rewards/reward_func/std": 0.45112112164497375, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8283609077334404, "epoch": 1.185546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2470514476299286, "learning_rate": 1.810526315789474e-06, "loss": -0.0, "num_tokens": 2855608.0, "reward": 0.7521250247955322, "reward_std": 0.4598061442375183, "rewards/reward_func/mean": 0.7521250247955322, "rewards/reward_func/std": 0.4257110357284546, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6406707651913166, "epoch": 1.1875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2593001425266266, "learning_rate": 1.805263157894737e-06, "loss": 0.0, "num_tokens": 2860792.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7551974728703499, "epoch": 1.189453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2569890022277832, "learning_rate": 1.8000000000000001e-06, "loss": 0.0, "num_tokens": 2865472.0, "reward": 0.635937511920929, "reward_std": 0.5168001055717468, "rewards/reward_func/mean": 0.635937511920929, "rewards/reward_func/std": 0.5031790733337402, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8159989677369595, "epoch": 1.19140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2728988826274872, "learning_rate": 1.7947368421052633e-06, "loss": 0.0, "num_tokens": 2869952.0, "reward": 0.6223000288009644, "reward_std": 0.536927342414856, "rewards/reward_func/mean": 0.6223000288009644, "rewards/reward_func/std": 0.5153651237487793, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5178343709558249, "epoch": 1.193359375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1404290795326233, "learning_rate": 1.7894736842105265e-06, "loss": 0.0, "num_tokens": 2874464.0, "reward": 0.6972000002861023, "reward_std": 0.2683524191379547, "rewards/reward_func/mean": 0.6972000002861023, "rewards/reward_func/std": 0.43032118678092957, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5767571702599525, "epoch": 1.1953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2455584704875946, "learning_rate": 1.7842105263157896e-06, "loss": -0.0, "num_tokens": 2879264.0, "reward": 0.7221124768257141, "reward_std": 0.27265700697898865, "rewards/reward_func/mean": 0.7221125364303589, "rewards/reward_func/std": 0.4268017113208771, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.568201445043087, "epoch": 1.197265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2082817703485489, "learning_rate": 1.7789473684210528e-06, "loss": 0.0, "num_tokens": 2884240.0, "reward": 0.01875000074505806, "reward_std": 0.028509706258773804, "rewards/reward_func/mean": 0.01875000074505806, "rewards/reward_func/std": 0.026726124808192253, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6839352063834667, "epoch": 1.19921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.18117974698543549, "learning_rate": 1.7736842105263162e-06, "loss": -0.0, "num_tokens": 2889224.0, "reward": 0.24700000882148743, "reward_std": 0.4290767312049866, "rewards/reward_func/mean": 0.24699999392032623, "rewards/reward_func/std": 0.39724788069725037, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6872431337833405, "epoch": 1.201171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2561533749103546, "learning_rate": 1.768421052631579e-06, "loss": 0.0, "num_tokens": 2893976.0, "reward": 0.49361249804496765, "reward_std": 0.4846278727054596, "rewards/reward_func/mean": 0.49361249804496765, "rewards/reward_func/std": 0.5245352387428284, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5922851376235485, "epoch": 1.203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20768021047115326, "learning_rate": 1.7631578947368423e-06, "loss": -0.0, "num_tokens": 2898728.0, "reward": 0.3634375035762787, "reward_std": 0.4954363703727722, "rewards/reward_func/mean": 0.3634375035762787, "rewards/reward_func/std": 0.4777008891105652, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5938981436192989, "epoch": 1.205078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20559649169445038, "learning_rate": 1.7578947368421054e-06, "loss": 0.0, "num_tokens": 2903416.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8055331483483315, "epoch": 1.20703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.26412415504455566, "learning_rate": 1.7526315789473686e-06, "loss": -0.0, "num_tokens": 2907952.0, "reward": 0.7447500228881836, "reward_std": 0.2991751432418823, "rewards/reward_func/mean": 0.7447500228881836, "rewards/reward_func/std": 0.4598980247974396, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7401883006095886, "epoch": 1.208984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2682240605354309, "learning_rate": 1.7473684210526318e-06, "loss": -0.0, "num_tokens": 2912488.0, "reward": 0.7504249811172485, "reward_std": 0.49200356006622314, "rewards/reward_func/mean": 0.7504249811172485, "rewards/reward_func/std": 0.4555671811103821, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5817705541849136, "epoch": 1.2109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20313231647014618, "learning_rate": 1.742105263157895e-06, "loss": 0.0, "num_tokens": 2917136.0, "reward": 0.23980000615119934, "reward_std": 0.4796000123023987, "rewards/reward_func/mean": 0.23980000615119934, "rewards/reward_func/std": 0.4441419541835785, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5425855070352554, "epoch": 1.212890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24401021003723145, "learning_rate": 1.736842105263158e-06, "loss": 0.0, "num_tokens": 2922088.0, "reward": 0.23787501454353333, "reward_std": 0.4510968029499054, "rewards/reward_func/mean": 0.23787499964237213, "rewards/reward_func/std": 0.4178653955459595, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.43251657113432884, "epoch": 1.21484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2256747931241989, "learning_rate": 1.731578947368421e-06, "loss": -0.0, "num_tokens": 2927016.0, "reward": 0.38294997811317444, "reward_std": 0.5065792798995972, "rewards/reward_func/mean": 0.38294997811317444, "rewards/reward_func/std": 0.487602561712265, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5035722590982914, "epoch": 1.216796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.20219974219799042, "learning_rate": 1.7263157894736842e-06, "loss": -0.0, "num_tokens": 2931856.0, "reward": 0.7478624582290649, "reward_std": 0.46107497811317444, "rewards/reward_func/mean": 0.7478624582290649, "rewards/reward_func/std": 0.42688557505607605, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6267634443938732, "epoch": 1.21875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2189279943704605, "learning_rate": 1.7210526315789474e-06, "loss": 0.0, "num_tokens": 2936608.0, "reward": 0.7312500476837158, "reward_std": 0.4379579424858093, "rewards/reward_func/mean": 0.7312500476837158, "rewards/reward_func/std": 0.40563005208969116, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3103468529880047, "epoch": 1.220703125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.7157894736842107e-06, "loss": 0.0, "num_tokens": 2941112.0, "reward": 0.9296000003814697, "reward_std": 0.0, "rewards/reward_func/mean": 0.9296000003814697, "rewards/reward_func/std": 0.0, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47227631881833076, "epoch": 1.22265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.22579093277454376, "learning_rate": 1.710526315789474e-06, "loss": -0.0, "num_tokens": 2945904.0, "reward": 0.36162498593330383, "reward_std": 0.5129633545875549, "rewards/reward_func/mean": 0.36162498593330383, "rewards/reward_func/std": 0.4922618269920349, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.2546942033804953, "epoch": 1.224609375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.705263157894737e-06, "loss": 0.0, "num_tokens": 2950400.0, "reward": 0.9296000003814697, "reward_std": 0.0, "rewards/reward_func/mean": 0.9296000003814697, "rewards/reward_func/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6319939978420734, "epoch": 1.2265625, "frac_reward_zero_std": 0.5, "grad_norm": 0.14794784784317017, "learning_rate": 1.7000000000000002e-06, "loss": -0.0, "num_tokens": 2955392.0, "reward": 0.2150000035762787, "reward_std": 0.24852363765239716, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.39838388562202454, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.716876182705164, "epoch": 1.228515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2454557716846466, "learning_rate": 1.6947368421052632e-06, "loss": 0.0, "num_tokens": 2960184.0, "reward": 0.3511999845504761, "reward_std": 0.5037546157836914, "rewards/reward_func/mean": 0.35120001435279846, "rewards/reward_func/std": 0.484712690114975, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.35377194359898567, "epoch": 1.23046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19669418036937714, "learning_rate": 1.6894736842105264e-06, "loss": -0.0, "num_tokens": 2964688.0, "reward": 0.5809999704360962, "reward_std": 0.5007524490356445, "rewards/reward_func/mean": 0.5809999704360962, "rewards/reward_func/std": 0.4811137020587921, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5233113840222359, "epoch": 1.232421875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.6842105263157895e-06, "loss": 0.0, "num_tokens": 2969024.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49619767256081104, "epoch": 1.234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18364404141902924, "learning_rate": 1.6789473684210527e-06, "loss": 0.0, "num_tokens": 2974000.0, "reward": 0.2407499998807907, "reward_std": 0.26884350180625916, "rewards/reward_func/mean": 0.2407500147819519, "rewards/reward_func/std": 0.4001612961292267, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6544572822749615, "epoch": 1.236328125, "frac_reward_zero_std": 0.5, "grad_norm": 0.14807765185832977, "learning_rate": 1.673684210526316e-06, "loss": 0.0, "num_tokens": 2979000.0, "reward": 0.23649999499320984, "reward_std": 0.27312225103378296, "rewards/reward_func/mean": 0.23649999499320984, "rewards/reward_func/std": 0.4379509389400482, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5549633167684078, "epoch": 1.23828125, "frac_reward_zero_std": 0.5, "grad_norm": 0.12250755727291107, "learning_rate": 1.6684210526315792e-06, "loss": 0.0, "num_tokens": 2984016.0, "reward": 0.11140000075101852, "reward_std": 0.22280000150203705, "rewards/reward_func/mean": 0.11140000075101852, "rewards/reward_func/std": 0.3150867819786072, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6770503744482994, "epoch": 1.240234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21593227982521057, "learning_rate": 1.6631578947368424e-06, "loss": 0.0, "num_tokens": 2988672.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.569544143974781, "epoch": 1.2421875, "frac_reward_zero_std": 0.5, "grad_norm": 0.14851713180541992, "learning_rate": 1.6578947368421053e-06, "loss": 0.0, "num_tokens": 2993032.0, "reward": 0.878125011920929, "reward_std": 0.24375002086162567, "rewards/reward_func/mean": 0.878125011920929, "rewards/reward_func/std": 0.34471458196640015, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7452700100839138, "epoch": 1.244140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23980359733104706, "learning_rate": 1.6526315789473685e-06, "loss": -0.0, "num_tokens": 2998208.0, "reward": 0.2562499940395355, "reward_std": 0.3011751174926758, "rewards/reward_func/mean": 0.2562499940395355, "rewards/reward_func/std": 0.4593765139579773, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5180889405310154, "epoch": 1.24609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2281617522239685, "learning_rate": 1.6473684210526317e-06, "loss": 0.0, "num_tokens": 3003064.0, "reward": 0.48240000009536743, "reward_std": 0.5435653924942017, "rewards/reward_func/mean": 0.48240000009536743, "rewards/reward_func/std": 0.5036382079124451, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5895957332104445, "epoch": 1.248046875, "frac_reward_zero_std": 0.5, "grad_norm": 0.16368906199932098, "learning_rate": 1.6421052631578948e-06, "loss": 0.0, "num_tokens": 3007408.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.713409535586834, "epoch": 1.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.2973420023918152, "learning_rate": 1.636842105263158e-06, "loss": 0.0, "num_tokens": 3012280.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6381527129560709, "epoch": 1.251953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20075997710227966, "learning_rate": 1.6315789473684212e-06, "loss": 0.0, "num_tokens": 3017240.0, "reward": 0.23475000262260437, "reward_std": 0.4529085159301758, "rewards/reward_func/mean": 0.23475000262260437, "rewards/reward_func/std": 0.41961437463760376, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5148098133504391, "epoch": 1.25390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20074260234832764, "learning_rate": 1.6263157894736845e-06, "loss": 0.0, "num_tokens": 3022208.0, "reward": 0.22257500886917114, "reward_std": 0.26535123586654663, "rewards/reward_func/mean": 0.22257500886917114, "rewards/reward_func/std": 0.39029690623283386, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7974681630730629, "epoch": 1.255859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24957697093486786, "learning_rate": 1.6210526315789473e-06, "loss": -0.0, "num_tokens": 3026848.0, "reward": 0.48893749713897705, "reward_std": 0.4796658158302307, "rewards/reward_func/mean": 0.48893749713897705, "rewards/reward_func/std": 0.512710690498352, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6741067506372929, "epoch": 1.2578125, "frac_reward_zero_std": 0.5, "grad_norm": 0.1625678539276123, "learning_rate": 1.6157894736842106e-06, "loss": -0.0, "num_tokens": 3031184.0, "reward": 0.9947500228881836, "reward_std": 0.010500003583729267, "rewards/reward_func/mean": 0.9947500228881836, "rewards/reward_func/std": 0.014849240891635418, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6193629987537861, "epoch": 1.259765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23957446217536926, "learning_rate": 1.6105263157894738e-06, "loss": 0.0, "num_tokens": 3035928.0, "reward": 0.6013749837875366, "reward_std": 0.5029712319374084, "rewards/reward_func/mean": 0.6013749837875366, "rewards/reward_func/std": 0.4784298539161682, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8137162700295448, "epoch": 1.26171875, "frac_reward_zero_std": 0.5, "grad_norm": 0.14833250641822815, "learning_rate": 1.605263157894737e-06, "loss": -0.0, "num_tokens": 3040408.0, "reward": 0.879687488079071, "reward_std": 0.24062499403953552, "rewards/reward_func/mean": 0.879687488079071, "rewards/reward_func/std": 0.34029513597488403, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6442223079502583, "epoch": 1.263671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2811136841773987, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "num_tokens": 3045264.0, "reward": 0.350600004196167, "reward_std": 0.5062717199325562, "rewards/reward_func/mean": 0.350600004196167, "rewards/reward_func/std": 0.4842973053455353, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3350534597411752, "epoch": 1.265625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1243121474981308, "learning_rate": 1.5947368421052633e-06, "loss": -0.0, "num_tokens": 3049752.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.575018860399723, "epoch": 1.267578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22247184813022614, "learning_rate": 1.5894736842105265e-06, "loss": 0.0, "num_tokens": 3054392.0, "reward": 0.5039750337600708, "reward_std": 0.5604059100151062, "rewards/reward_func/mean": 0.5039750337600708, "rewards/reward_func/std": 0.5189851522445679, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3880303241312504, "epoch": 1.26953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.1775999665260315, "learning_rate": 1.5842105263157894e-06, "loss": 0.0, "num_tokens": 3059368.0, "reward": 0.23032499849796295, "reward_std": 0.2687353491783142, "rewards/reward_func/mean": 0.23032499849796295, "rewards/reward_func/std": 0.4190002381801605, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4622784499078989, "epoch": 1.271484375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1834346204996109, "learning_rate": 1.5789473684210526e-06, "loss": -0.0, "num_tokens": 3063872.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8611919060349464, "epoch": 1.2734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3340691924095154, "learning_rate": 1.573684210526316e-06, "loss": 0.0, "num_tokens": 3068736.0, "reward": 0.7515624761581421, "reward_std": 0.49687498807907104, "rewards/reward_func/mean": 0.7515624761581421, "rewards/reward_func/std": 0.4600290060043335, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6393865756690502, "epoch": 1.275390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.22418461740016937, "learning_rate": 1.5684210526315791e-06, "loss": 0.0, "num_tokens": 3073368.0, "reward": 0.4921249747276306, "reward_std": 0.5610401630401611, "rewards/reward_func/mean": 0.492125004529953, "rewards/reward_func/std": 0.519443690776825, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5920063890516758, "epoch": 1.27734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.26118144392967224, "learning_rate": 1.5631578947368423e-06, "loss": 0.0, "num_tokens": 3078168.0, "reward": 0.617437481880188, "reward_std": 0.25803983211517334, "rewards/reward_func/mean": 0.617437481880188, "rewards/reward_func/std": 0.4884096086025238, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6909713298082352, "epoch": 1.279296875, "frac_reward_zero_std": 0.5, "grad_norm": 0.18258942663669586, "learning_rate": 1.5578947368421054e-06, "loss": 0.0, "num_tokens": 3083008.0, "reward": 0.11959999799728394, "reward_std": 0.23919998109340668, "rewards/reward_func/mean": 0.11959999799728394, "rewards/reward_func/std": 0.3382799029350281, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.657719861716032, "epoch": 1.28125, "frac_reward_zero_std": 0.5, "grad_norm": 0.1802099496126175, "learning_rate": 1.5526315789473686e-06, "loss": 0.0, "num_tokens": 3087856.0, "reward": 0.11620000004768372, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.11620000004768372, "rewards/reward_func/std": 0.32866325974464417, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6168779209256172, "epoch": 1.283203125, "frac_reward_zero_std": 0.5, "grad_norm": 0.1258898824453354, "learning_rate": 1.5473684210526316e-06, "loss": -0.0, "num_tokens": 3092488.0, "reward": 0.625, "reward_std": 0.25, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6636889725923538, "epoch": 1.28515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2637583911418915, "learning_rate": 1.5421052631578947e-06, "loss": 0.0, "num_tokens": 3097240.0, "reward": 0.8300000429153442, "reward_std": 0.25, "rewards/reward_func/mean": 0.8300000429153442, "rewards/reward_func/std": 0.33602720499038696, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8549417704343796, "epoch": 1.287109375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5368421052631579e-06, "loss": 0.0, "num_tokens": 3101800.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5893873460590839, "epoch": 1.2890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.22459273040294647, "learning_rate": 1.5315789473684213e-06, "loss": -0.0, "num_tokens": 3106536.0, "reward": 0.598437488079071, "reward_std": 0.4938383102416992, "rewards/reward_func/mean": 0.5984375476837158, "rewards/reward_func/std": 0.47170084714889526, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7923720888793468, "epoch": 1.291015625, "frac_reward_zero_std": 0.5, "grad_norm": 0.18553104996681213, "learning_rate": 1.5263157894736844e-06, "loss": 0.0, "num_tokens": 3111304.0, "reward": 0.8560999631881714, "reward_std": 0.24459999799728394, "rewards/reward_func/mean": 0.8560999631881714, "rewards/reward_func/std": 0.34591665863990784, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.46678472869098186, "epoch": 1.29296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.256147563457489, "learning_rate": 1.5210526315789476e-06, "loss": 0.0, "num_tokens": 3116096.0, "reward": 0.5954375267028809, "reward_std": 0.48662465810775757, "rewards/reward_func/mean": 0.5954374670982361, "rewards/reward_func/std": 0.4691931903362274, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7971262335777283, "epoch": 1.294921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2475176900625229, "learning_rate": 1.5157894736842108e-06, "loss": -0.0, "num_tokens": 3120744.0, "reward": 0.5011875033378601, "reward_std": 0.5680569410324097, "rewards/reward_func/mean": 0.5011875033378601, "rewards/reward_func/std": 0.5259928107261658, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4731484353542328, "epoch": 1.296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.18087652325630188, "learning_rate": 1.5105263157894737e-06, "loss": 0.0, "num_tokens": 3125712.0, "reward": 0.11857499927282333, "reward_std": 0.2205643504858017, "rewards/reward_func/mean": 0.11857499927282333, "rewards/reward_func/std": 0.3053269386291504, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7882614396512508, "epoch": 1.298828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2276054322719574, "learning_rate": 1.5052631578947369e-06, "loss": -0.0, "num_tokens": 3130712.0, "reward": 0.4634000062942505, "reward_std": 0.4610704183578491, "rewards/reward_func/mean": 0.4634000062942505, "rewards/reward_func/std": 0.4954361617565155, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6726894378662109, "epoch": 1.30078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.25436362624168396, "learning_rate": 1.5e-06, "loss": 0.0, "num_tokens": 3135368.0, "reward": 0.6215875148773193, "reward_std": 0.5241646766662598, "rewards/reward_func/mean": 0.6215875148773193, "rewards/reward_func/std": 0.5046280026435852, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7788518108427525, "epoch": 1.302734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2387314885854721, "learning_rate": 1.4947368421052632e-06, "loss": 0.0, "num_tokens": 3140104.0, "reward": 0.721875011920929, "reward_std": 0.2759787440299988, "rewards/reward_func/mean": 0.721875011920929, "rewards/reward_func/std": 0.42338719964027405, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6936579272150993, "epoch": 1.3046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21527528762817383, "learning_rate": 1.4894736842105264e-06, "loss": 0.0, "num_tokens": 3145096.0, "reward": 0.23100000619888306, "reward_std": 0.4619999825954437, "rewards/reward_func/mean": 0.23100000619888306, "rewards/reward_func/std": 0.42808806896209717, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.649364996701479, "epoch": 1.306640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.25936535000801086, "learning_rate": 1.4842105263157897e-06, "loss": -0.0, "num_tokens": 3149888.0, "reward": 0.8514000177383423, "reward_std": 0.2537573277950287, "rewards/reward_func/mean": 0.8514000177383423, "rewards/reward_func/std": 0.3446618318557739, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7287271581590176, "epoch": 1.30859375, "frac_reward_zero_std": 0.5, "grad_norm": 0.17086577415466309, "learning_rate": 1.478947368421053e-06, "loss": 0.0, "num_tokens": 3154360.0, "reward": 0.7593749761581421, "reward_std": 0.27784982323646545, "rewards/reward_func/mean": 0.7593749761581421, "rewards/reward_func/std": 0.44555091857910156, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47401850298047066, "epoch": 1.310546875, "frac_reward_zero_std": 0.5, "grad_norm": 0.17154361307621002, "learning_rate": 1.4736842105263159e-06, "loss": 0.0, "num_tokens": 3159040.0, "reward": 0.8828125, "reward_std": 0.234375, "rewards/reward_func/mean": 0.8828125, "rewards/reward_func/std": 0.3314563035964966, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5196984093636274, "epoch": 1.3125, "frac_reward_zero_std": 0.0, "grad_norm": 0.185403510928154, "learning_rate": 1.468421052631579e-06, "loss": -0.0, "num_tokens": 3164008.0, "reward": 0.012500000186264515, "reward_std": 0.016983473673462868, "rewards/reward_func/mean": 0.012500000186264515, "rewards/reward_func/std": 0.021128857508301735, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3447682708501816, "epoch": 1.314453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.18098026514053345, "learning_rate": 1.4631578947368422e-06, "loss": -0.0, "num_tokens": 3168520.0, "reward": 0.6972000002861023, "reward_std": 0.46480000019073486, "rewards/reward_func/mean": 0.6972000002861023, "rewards/reward_func/std": 0.43032118678092957, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5853043086826801, "epoch": 1.31640625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.4578947368421053e-06, "loss": 0.0, "num_tokens": 3172856.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6624711155891418, "epoch": 1.318359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24813523888587952, "learning_rate": 1.4526315789473685e-06, "loss": 0.0, "num_tokens": 3177696.0, "reward": 0.7400499582290649, "reward_std": 0.4767000079154968, "rewards/reward_func/mean": 0.7400499582290649, "rewards/reward_func/std": 0.4415406882762909, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8206646218895912, "epoch": 1.3203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2409617155790329, "learning_rate": 1.4473684210526317e-06, "loss": -0.0, "num_tokens": 3182448.0, "reward": 0.8735250234603882, "reward_std": 0.23993384838104248, "rewards/reward_func/mean": 0.8735250234603882, "rewards/reward_func/std": 0.32304173707962036, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6698844209313393, "epoch": 1.322265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.225163534283638, "learning_rate": 1.442105263157895e-06, "loss": -0.0, "num_tokens": 3187224.0, "reward": 0.3618749976158142, "reward_std": 0.4973260462284088, "rewards/reward_func/mean": 0.3618749976158142, "rewards/reward_func/std": 0.4790313243865967, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5397326983511448, "epoch": 1.32421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.17920690774917603, "learning_rate": 1.4368421052631582e-06, "loss": -0.0, "num_tokens": 3191880.0, "reward": 0.49943751096725464, "reward_std": 0.49373185634613037, "rewards/reward_func/mean": 0.49943751096725464, "rewards/reward_func/std": 0.5242229104042053, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6977795027196407, "epoch": 1.326171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.26054197549819946, "learning_rate": 1.4315789473684212e-06, "loss": 0.0, "num_tokens": 3196416.0, "reward": 0.7473000288009644, "reward_std": 0.4982522130012512, "rewards/reward_func/mean": 0.7473000288009644, "rewards/reward_func/std": 0.4613037705421448, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7697173990309238, "epoch": 1.328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.26796966791152954, "learning_rate": 1.4263157894736843e-06, "loss": 0.0, "num_tokens": 3201416.0, "reward": 0.2418999969959259, "reward_std": 0.4837999939918518, "rewards/reward_func/mean": 0.2418999969959259, "rewards/reward_func/std": 0.44794899225234985, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5212496444582939, "epoch": 1.330078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21516187489032745, "learning_rate": 1.4210526315789475e-06, "loss": -0.0, "num_tokens": 3206208.0, "reward": 0.24524998664855957, "reward_std": 0.4739798605442047, "rewards/reward_func/mean": 0.24524998664855957, "rewards/reward_func/std": 0.43882009387016296, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.694312822073698, "epoch": 1.33203125, "frac_reward_zero_std": 0.5, "grad_norm": 0.16465605795383453, "learning_rate": 1.4157894736842107e-06, "loss": 0.0, "num_tokens": 3210816.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6481199897825718, "epoch": 1.333984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.23585893213748932, "learning_rate": 1.4105263157894738e-06, "loss": 0.0, "num_tokens": 3215672.0, "reward": 0.7447500228881836, "reward_std": 0.4966987073421478, "rewards/reward_func/mean": 0.7447500228881836, "rewards/reward_func/std": 0.4598980247974396, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7410243712365627, "epoch": 1.3359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24909096956253052, "learning_rate": 1.405263157894737e-06, "loss": -0.0, "num_tokens": 3220472.0, "reward": 0.23709999024868011, "reward_std": 0.4575529098510742, "rewards/reward_func/mean": 0.2371000051498413, "rewards/reward_func/std": 0.42398691177368164, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.34077938739210367, "epoch": 1.337890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.1768733263015747, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "num_tokens": 3224976.0, "reward": 0.8226249814033508, "reward_std": 0.2383500039577484, "rewards/reward_func/mean": 0.8226249814033508, "rewards/reward_func/std": 0.3227412700653076, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5411074692383409, "epoch": 1.33984375, "frac_reward_zero_std": 0.5, "grad_norm": 0.11871396750211716, "learning_rate": 1.394736842105263e-06, "loss": 0.0, "num_tokens": 3229472.0, "reward": 0.8113000392913818, "reward_std": 0.23103395104408264, "rewards/reward_func/mean": 0.8113000392913818, "rewards/reward_func/std": 0.32786741852760315, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.45428065955638885, "epoch": 1.341796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.1815212070941925, "learning_rate": 1.3894736842105263e-06, "loss": 0.0, "num_tokens": 3234432.0, "reward": 0.43700000643730164, "reward_std": 0.4397338032722473, "rewards/reward_func/mean": 0.43700000643730164, "rewards/reward_func/std": 0.46753212809562683, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5205916427075863, "epoch": 1.34375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21481692790985107, "learning_rate": 1.3842105263157896e-06, "loss": -0.0, "num_tokens": 3239224.0, "reward": 0.13824999332427979, "reward_std": 0.24868974089622498, "rewards/reward_func/mean": 0.13824999332427979, "rewards/reward_func/std": 0.33114388585090637, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47748175263404846, "epoch": 1.345703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.1996050328016281, "learning_rate": 1.3789473684210528e-06, "loss": -0.0, "num_tokens": 3244008.0, "reward": 0.3589249849319458, "reward_std": 0.509075939655304, "rewards/reward_func/mean": 0.3589249849319458, "rewards/reward_func/std": 0.4885358214378357, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7458081506192684, "epoch": 1.34765625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1602766215801239, "learning_rate": 1.373684210526316e-06, "loss": -0.0, "num_tokens": 3248512.0, "reward": 0.8754249811172485, "reward_std": 0.24200356006622314, "rewards/reward_func/mean": 0.8754249811172485, "rewards/reward_func/std": 0.34370672702789307, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5473680794239044, "epoch": 1.349609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2388388216495514, "learning_rate": 1.3684210526315791e-06, "loss": 0.0, "num_tokens": 3252872.0, "reward": 0.515625, "reward_std": 0.4859190583229065, "rewards/reward_func/mean": 0.515625, "rewards/reward_func/std": 0.5182280540466309, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3880773186683655, "epoch": 1.3515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21450631320476532, "learning_rate": 1.3631578947368423e-06, "loss": 0.0, "num_tokens": 3257656.0, "reward": 0.5974999666213989, "reward_std": 0.5149734020233154, "rewards/reward_func/mean": 0.5974999666213989, "rewards/reward_func/std": 0.49477702379226685, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5613982677459717, "epoch": 1.353515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20392994582653046, "learning_rate": 1.3578947368421052e-06, "loss": 0.0, "num_tokens": 3262312.0, "reward": 0.7505750060081482, "reward_std": 0.4759466052055359, "rewards/reward_func/mean": 0.7505750060081482, "rewards/reward_func/std": 0.44124478101730347, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6902660578489304, "epoch": 1.35546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2161121368408203, "learning_rate": 1.3526315789473684e-06, "loss": -0.0, "num_tokens": 3267328.0, "reward": 0.5890499949455261, "reward_std": 0.5059839487075806, "rewards/reward_func/mean": 0.5890499949455261, "rewards/reward_func/std": 0.4884392023086548, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5899221636354923, "epoch": 1.357421875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3473684210526316e-06, "loss": 0.0, "num_tokens": 3272280.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9027910940349102, "epoch": 1.359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.25422924757003784, "learning_rate": 1.342105263157895e-06, "loss": 0.0, "num_tokens": 3276816.0, "reward": 0.5062500238418579, "reward_std": 0.4917367100715637, "rewards/reward_func/mean": 0.5062500238418579, "rewards/reward_func/std": 0.5279255509376526, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6606407798826694, "epoch": 1.361328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2949965000152588, "learning_rate": 1.3368421052631581e-06, "loss": -0.0, "num_tokens": 3281448.0, "reward": 0.7399374842643738, "reward_std": 0.46412500739097595, "rewards/reward_func/mean": 0.7399374842643738, "rewards/reward_func/std": 0.43033212423324585, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6454514786601067, "epoch": 1.36328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2416992336511612, "learning_rate": 1.3315789473684213e-06, "loss": 0.0, "num_tokens": 3285792.0, "reward": 0.762499988079071, "reward_std": 0.4749999940395355, "rewards/reward_func/mean": 0.762499988079071, "rewards/reward_func/std": 0.4397645592689514, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6855951938778162, "epoch": 1.365234375, "frac_reward_zero_std": 0.5, "grad_norm": 0.15963053703308105, "learning_rate": 1.3263157894736844e-06, "loss": 0.0, "num_tokens": 3290760.0, "reward": 0.11100000143051147, "reward_std": 0.22200000286102295, "rewards/reward_func/mean": 0.11100000143051147, "rewards/reward_func/std": 0.3139553964138031, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7190686650574207, "epoch": 1.3671875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1730789840221405, "learning_rate": 1.3210526315789474e-06, "loss": 0.0, "num_tokens": 3295248.0, "reward": 0.8765624761581421, "reward_std": 0.24687500298023224, "rewards/reward_func/mean": 0.8765624761581421, "rewards/reward_func/std": 0.34913399815559387, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6450852937996387, "epoch": 1.369140625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.3157894736842106e-06, "loss": 0.0, "num_tokens": 3299880.0, "reward": 0.972000002861023, "reward_std": 0.0, "rewards/reward_func/mean": 0.972000002861023, "rewards/reward_func/std": 0.0, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8256732821464539, "epoch": 1.37109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2364703118801117, "learning_rate": 1.3105263157894737e-06, "loss": 0.0, "num_tokens": 3304512.0, "reward": 0.8487499952316284, "reward_std": 0.24650000035762787, "rewards/reward_func/mean": 0.8487499952316284, "rewards/reward_func/std": 0.3429817855358124, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6302417553961277, "epoch": 1.373046875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1295423060655594, "learning_rate": 1.3052631578947369e-06, "loss": 0.0, "num_tokens": 3309296.0, "reward": 0.11749999970197678, "reward_std": 0.23499999940395355, "rewards/reward_func/mean": 0.11749999970197678, "rewards/reward_func/std": 0.33234018087387085, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.957762099802494, "epoch": 1.375, "frac_reward_zero_std": 0.5, "grad_norm": 0.18025648593902588, "learning_rate": 1.3e-06, "loss": 0.0, "num_tokens": 3314128.0, "reward": 0.11959999799728394, "reward_std": 0.23919998109340668, "rewards/reward_func/mean": 0.11959999799728394, "rewards/reward_func/std": 0.3382799029350281, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7645817883312702, "epoch": 1.376953125, "frac_reward_zero_std": 0.5, "grad_norm": 0.1874340921640396, "learning_rate": 1.2947368421052634e-06, "loss": -0.0, "num_tokens": 3318736.0, "reward": 0.7447500228881836, "reward_std": 0.2827429473400116, "rewards/reward_func/mean": 0.7447500228881836, "rewards/reward_func/std": 0.4598980247974396, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.732977319508791, "epoch": 1.37890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23621387779712677, "learning_rate": 1.2894736842105266e-06, "loss": -0.0, "num_tokens": 3323384.0, "reward": 0.8697500228881836, "reward_std": 0.2605000138282776, "rewards/reward_func/mean": 0.8697500228881836, "rewards/reward_func/std": 0.3517392575740814, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8189778625965118, "epoch": 1.380859375, "frac_reward_zero_std": 0.5, "grad_norm": 0.16602914035320282, "learning_rate": 1.2842105263157895e-06, "loss": -0.0, "num_tokens": 3327856.0, "reward": 0.9947500228881836, "reward_std": 0.010500003583729267, "rewards/reward_func/mean": 0.9947500228881836, "rewards/reward_func/std": 0.014849240891635418, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5835609417408705, "epoch": 1.3828125, "frac_reward_zero_std": 0.5, "grad_norm": 0.20208346843719482, "learning_rate": 1.2789473684210527e-06, "loss": 0.0, "num_tokens": 3332536.0, "reward": 0.635937511920929, "reward_std": 0.24358300864696503, "rewards/reward_func/mean": 0.635937511920929, "rewards/reward_func/std": 0.5031790733337402, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3960736747831106, "epoch": 1.384765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19159826636314392, "learning_rate": 1.2736842105263159e-06, "loss": 0.0, "num_tokens": 3337040.0, "reward": 0.8113000392913818, "reward_std": 0.23659999668598175, "rewards/reward_func/mean": 0.8113000392913818, "rewards/reward_func/std": 0.32786741852760315, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7697632871568203, "epoch": 1.38671875, "frac_reward_zero_std": 0.5, "grad_norm": 0.16308048367500305, "learning_rate": 1.268421052631579e-06, "loss": 0.0, "num_tokens": 3342048.0, "reward": 0.12229999899864197, "reward_std": 0.24459999799728394, "rewards/reward_func/mean": 0.12229999899864197, "rewards/reward_func/std": 0.34591662883758545, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6840203031897545, "epoch": 1.388671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.23875313997268677, "learning_rate": 1.2631578947368422e-06, "loss": 0.0, "num_tokens": 3346576.0, "reward": 0.7488625049591064, "reward_std": 0.4951278567314148, "rewards/reward_func/mean": 0.7488625049591064, "rewards/reward_func/std": 0.4584231972694397, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6921983473002911, "epoch": 1.390625, "frac_reward_zero_std": 0.5, "grad_norm": 0.21332110464572906, "learning_rate": 1.2578947368421054e-06, "loss": 0.0, "num_tokens": 3351320.0, "reward": 0.47312501072883606, "reward_std": 0.0062500000931322575, "rewards/reward_func/mean": 0.47312498092651367, "rewards/reward_func/std": 0.4991774260997772, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7641146704554558, "epoch": 1.392578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21671804785728455, "learning_rate": 1.2526315789473687e-06, "loss": 0.0, "num_tokens": 3356000.0, "reward": 0.8525000214576721, "reward_std": 0.25541630387306213, "rewards/reward_func/mean": 0.8525000214576721, "rewards/reward_func/std": 0.34573936462402344, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8032694086432457, "epoch": 1.39453125, "frac_reward_zero_std": 0.5, "grad_norm": 0.2227100133895874, "learning_rate": 1.2473684210526317e-06, "loss": 0.0, "num_tokens": 3360616.0, "reward": 0.8529999852180481, "reward_std": 0.2427220493555069, "rewards/reward_func/mean": 0.8529999852180481, "rewards/reward_func/std": 0.3447500169277191, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.20121474005281925, "epoch": 1.396484375, "frac_reward_zero_std": 0.5, "grad_norm": 0.11205914616584778, "learning_rate": 1.2421052631578948e-06, "loss": -0.0, "num_tokens": 3365096.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7984618470072746, "epoch": 1.3984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.23668164014816284, "learning_rate": 1.236842105263158e-06, "loss": -0.0, "num_tokens": 3369616.0, "reward": 0.7487249970436096, "reward_std": 0.28503718972206116, "rewards/reward_func/mean": 0.7487249970436096, "rewards/reward_func/std": 0.4394227862358093, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6621723957359791, "epoch": 1.400390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21383367478847504, "learning_rate": 1.2315789473684212e-06, "loss": -0.0, "num_tokens": 3374384.0, "reward": 0.5983499884605408, "reward_std": 0.5136319994926453, "rewards/reward_func/mean": 0.5983499884605408, "rewards/reward_func/std": 0.4956625998020172, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6606850810348988, "epoch": 1.40234375, "frac_reward_zero_std": 0.5, "grad_norm": 0.17316952347755432, "learning_rate": 1.2263157894736843e-06, "loss": 0.0, "num_tokens": 3379384.0, "reward": 0.11959999799728394, "reward_std": 0.23919998109340668, "rewards/reward_func/mean": 0.11959999799728394, "rewards/reward_func/std": 0.3382799029350281, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.23659847397357225, "epoch": 1.404296875, "frac_reward_zero_std": 0.5, "grad_norm": 0.13336242735385895, "learning_rate": 1.2210526315789475e-06, "loss": -0.0, "num_tokens": 3383896.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6463708803057671, "epoch": 1.40625, "frac_reward_zero_std": 0.0, "grad_norm": 0.25290825963020325, "learning_rate": 1.2157894736842107e-06, "loss": 0.0, "num_tokens": 3388760.0, "reward": 0.7478749752044678, "reward_std": 0.49045124650001526, "rewards/reward_func/mean": 0.7478749752044678, "rewards/reward_func/std": 0.4541146159172058, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7543365843594074, "epoch": 1.408203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24611547589302063, "learning_rate": 1.2105263157894738e-06, "loss": 0.0, "num_tokens": 3393488.0, "reward": 0.8598625063896179, "reward_std": 0.2406606525182724, "rewards/reward_func/mean": 0.8598625063896179, "rewards/reward_func/std": 0.3235507011413574, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.2327856719493866, "epoch": 1.41015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.1801598221063614, "learning_rate": 1.205263157894737e-06, "loss": -0.0, "num_tokens": 3398000.0, "reward": 0.7059999704360962, "reward_std": 0.2859524190425873, "rewards/reward_func/mean": 0.7059999704360962, "rewards/reward_func/std": 0.43642914295196533, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4816428292542696, "epoch": 1.412109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20217570662498474, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "num_tokens": 3402960.0, "reward": 0.10868749767541885, "reward_std": 0.210587739944458, "rewards/reward_func/mean": 0.10868749767541885, "rewards/reward_func/std": 0.2923278510570526, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5045717675238848, "epoch": 1.4140625, "frac_reward_zero_std": 0.5, "grad_norm": 0.12922586500644684, "learning_rate": 1.1947368421052633e-06, "loss": 0.0, "num_tokens": 3407472.0, "reward": 0.805899977684021, "reward_std": 0.22783933579921722, "rewards/reward_func/mean": 0.805899977684021, "rewards/reward_func/std": 0.326308935880661, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.551116107031703, "epoch": 1.416015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2109834998846054, "learning_rate": 1.1894736842105265e-06, "loss": -0.0, "num_tokens": 3412256.0, "reward": 0.24056249856948853, "reward_std": 0.47697657346725464, "rewards/reward_func/mean": 0.24056249856948853, "rewards/reward_func/std": 0.44159868359565735, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5529836490750313, "epoch": 1.41796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22575944662094116, "learning_rate": 1.1842105263157894e-06, "loss": 0.0, "num_tokens": 3417056.0, "reward": 0.4667624831199646, "reward_std": 0.535490095615387, "rewards/reward_func/mean": 0.4667624831199646, "rewards/reward_func/std": 0.4958255887031555, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.39940457232296467, "epoch": 1.419921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.18727923929691315, "learning_rate": 1.1789473684210526e-06, "loss": 0.0, "num_tokens": 3421848.0, "reward": 0.48112499713897705, "reward_std": 0.548362135887146, "rewards/reward_func/mean": 0.48112499713897705, "rewards/reward_func/std": 0.5076847076416016, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7712520267814398, "epoch": 1.421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.23952317237854004, "learning_rate": 1.173684210526316e-06, "loss": -0.0, "num_tokens": 3426856.0, "reward": 0.23100000619888306, "reward_std": 0.4619999825954437, "rewards/reward_func/mean": 0.23100000619888306, "rewards/reward_func/std": 0.42808806896209717, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7155905775725842, "epoch": 1.423828125, "frac_reward_zero_std": 0.5, "grad_norm": 0.14729009568691254, "learning_rate": 1.1684210526315791e-06, "loss": -0.0, "num_tokens": 3431856.0, "reward": 0.3450999855995178, "reward_std": 0.2304711490869522, "rewards/reward_func/mean": 0.3450999855995178, "rewards/reward_func/std": 0.47661837935447693, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5209857858717442, "epoch": 1.42578125, "frac_reward_zero_std": 0.5, "grad_norm": 0.17405037581920624, "learning_rate": 1.163157894736842e-06, "loss": -0.0, "num_tokens": 3436200.0, "reward": 0.7718750238418579, "reward_std": 0.2634160816669464, "rewards/reward_func/mean": 0.7718750238418579, "rewards/reward_func/std": 0.4224054515361786, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.75539655610919, "epoch": 1.427734375, "frac_reward_zero_std": 0.5, "grad_norm": 0.18450260162353516, "learning_rate": 1.1578947368421053e-06, "loss": 0.0, "num_tokens": 3440560.0, "reward": 0.7578125, "reward_std": 0.2797587811946869, "rewards/reward_func/mean": 0.7578125, "rewards/reward_func/std": 0.4485560953617096, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6287768185138702, "epoch": 1.4296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21361912786960602, "learning_rate": 1.1526315789473686e-06, "loss": -0.0, "num_tokens": 3445208.0, "reward": 0.8587499856948853, "reward_std": 0.2550273835659027, "rewards/reward_func/mean": 0.8587499856948853, "rewards/reward_func/std": 0.3472658097743988, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8078473433852196, "epoch": 1.431640625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1746349185705185, "learning_rate": 1.1473684210526316e-06, "loss": 0.0, "num_tokens": 3449704.0, "reward": 0.753125011920929, "reward_std": 0.2851123511791229, "rewards/reward_func/mean": 0.753125011920929, "rewards/reward_func/std": 0.4571725130081177, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6579478699713945, "epoch": 1.43359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20288410782814026, "learning_rate": 1.1421052631578947e-06, "loss": 0.0, "num_tokens": 3454664.0, "reward": 0.3410625159740448, "reward_std": 0.4895520806312561, "rewards/reward_func/mean": 0.3410625159740448, "rewards/reward_func/std": 0.4675552546977997, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6510977298021317, "epoch": 1.435546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21478044986724854, "learning_rate": 1.136842105263158e-06, "loss": 0.0, "num_tokens": 3459456.0, "reward": 0.7142499685287476, "reward_std": 0.28147342801094055, "rewards/reward_func/mean": 0.7142499685287476, "rewards/reward_func/std": 0.4409100115299225, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.22573751397430897, "epoch": 1.4375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.1315789473684213e-06, "loss": 0.0, "num_tokens": 3463960.0, "reward": 0.9296000003814697, "reward_std": 0.0, "rewards/reward_func/mean": 0.9296000003814697, "rewards/reward_func/std": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6214800626039505, "epoch": 1.439453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2706398665904999, "learning_rate": 1.1263157894736842e-06, "loss": -0.0, "num_tokens": 3468760.0, "reward": 0.6126375198364258, "reward_std": 0.5190345048904419, "rewards/reward_func/mean": 0.6126375198364258, "rewards/reward_func/std": 0.4971480667591095, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.860218707472086, "epoch": 1.44140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24548137187957764, "learning_rate": 1.1210526315789474e-06, "loss": 0.0, "num_tokens": 3473256.0, "reward": 0.6269875168800354, "reward_std": 0.53028404712677, "rewards/reward_func/mean": 0.6269874572753906, "rewards/reward_func/std": 0.5090279579162598, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 1.0728548988699913, "epoch": 1.443359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.27242031693458557, "learning_rate": 1.1157894736842106e-06, "loss": -0.0, "num_tokens": 3477752.0, "reward": 0.754687488079071, "reward_std": 0.4906249940395355, "rewards/reward_func/mean": 0.754687488079071, "rewards/reward_func/std": 0.45434102416038513, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5216426700353622, "epoch": 1.4453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21481481194496155, "learning_rate": 1.110526315789474e-06, "loss": -0.0, "num_tokens": 3482712.0, "reward": 0.11024999618530273, "reward_std": 0.2122509479522705, "rewards/reward_func/mean": 0.11024999618530273, "rewards/reward_func/std": 0.29184964299201965, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5817220564931631, "epoch": 1.447265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2324642688035965, "learning_rate": 1.1052631578947369e-06, "loss": 0.0, "num_tokens": 3487360.0, "reward": 0.36861249804496765, "reward_std": 0.5240679979324341, "rewards/reward_func/mean": 0.36861249804496765, "rewards/reward_func/std": 0.5054256319999695, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6618504039943218, "epoch": 1.44921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22483286261558533, "learning_rate": 1.1e-06, "loss": -0.0, "num_tokens": 3492376.0, "reward": 0.3586999773979187, "reward_std": 0.513043999671936, "rewards/reward_func/mean": 0.3586999773979187, "rewards/reward_func/std": 0.4951876997947693, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8393836617469788, "epoch": 1.451171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.27230191230773926, "learning_rate": 1.0947368421052632e-06, "loss": 0.0, "num_tokens": 3497008.0, "reward": 0.7250000238418579, "reward_std": 0.4834516644477844, "rewards/reward_func/mean": 0.7250000238418579, "rewards/reward_func/std": 0.44761592149734497, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.675093300640583, "epoch": 1.453125, "frac_reward_zero_std": 0.5, "grad_norm": 0.14707615971565247, "learning_rate": 1.0894736842105264e-06, "loss": 0.0, "num_tokens": 3501864.0, "reward": 0.8765624761581421, "reward_std": 0.24687500298023224, "rewards/reward_func/mean": 0.8765624761581421, "rewards/reward_func/std": 0.34913399815559387, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7936474606394768, "epoch": 1.455078125, "frac_reward_zero_std": 0.5, "grad_norm": 0.19649529457092285, "learning_rate": 1.0842105263157895e-06, "loss": 0.0, "num_tokens": 3506456.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5444208905100822, "epoch": 1.45703125, "frac_reward_zero_std": 0.5, "grad_norm": 0.1466275006532669, "learning_rate": 1.0789473684210527e-06, "loss": 0.0, "num_tokens": 3511136.0, "reward": 0.7593749761581421, "reward_std": 0.2780371904373169, "rewards/reward_func/mean": 0.7593749761581421, "rewards/reward_func/std": 0.44575127959251404, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4803670160472393, "epoch": 1.458984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20859357714653015, "learning_rate": 1.0736842105263159e-06, "loss": 0.0, "num_tokens": 3515912.0, "reward": 0.23980000615119934, "reward_std": 0.4796000123023987, "rewards/reward_func/mean": 0.23980000615119934, "rewards/reward_func/std": 0.4441418945789337, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5034095216542482, "epoch": 1.4609375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1568986475467682, "learning_rate": 1.068421052631579e-06, "loss": -0.0, "num_tokens": 3520712.0, "reward": 0.1288749873638153, "reward_std": 0.23274998366832733, "rewards/reward_func/mean": 0.1288749873638153, "rewards/reward_func/std": 0.33443784713745117, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7676276192069054, "epoch": 1.462890625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1505260467529297, "learning_rate": 1.0631578947368422e-06, "loss": 0.0, "num_tokens": 3525728.0, "reward": 0.22550000250339508, "reward_std": 0.2604222893714905, "rewards/reward_func/mean": 0.22550000250339508, "rewards/reward_func/std": 0.41758477687835693, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7294511571526527, "epoch": 1.46484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2366306334733963, "learning_rate": 1.0578947368421054e-06, "loss": 0.0, "num_tokens": 3530360.0, "reward": 0.8474999666213989, "reward_std": 0.2550317049026489, "rewards/reward_func/mean": 0.8474999666213989, "rewards/reward_func/std": 0.34281647205352783, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7323691435158253, "epoch": 1.466796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.28612589836120605, "learning_rate": 1.0526315789473685e-06, "loss": 0.0, "num_tokens": 3535048.0, "reward": 0.7447500228881836, "reward_std": 0.4966987073421478, "rewards/reward_func/mean": 0.7447500228881836, "rewards/reward_func/std": 0.4598980247974396, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.659924540668726, "epoch": 1.46875, "frac_reward_zero_std": 0.0, "grad_norm": 0.25231942534446716, "learning_rate": 1.0473684210526317e-06, "loss": -0.0, "num_tokens": 3539400.0, "reward": 0.8592499494552612, "reward_std": 0.25882306694984436, "rewards/reward_func/mean": 0.8592499494552612, "rewards/reward_func/std": 0.3478110134601593, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6127423346042633, "epoch": 1.470703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.19419050216674805, "learning_rate": 1.0421052631578949e-06, "loss": -0.0, "num_tokens": 3544256.0, "reward": 0.3825249969959259, "reward_std": 0.5170859694480896, "rewards/reward_func/mean": 0.3825249969959259, "rewards/reward_func/std": 0.49410948157310486, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5326105952262878, "epoch": 1.47265625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1416216343641281, "learning_rate": 1.036842105263158e-06, "loss": -0.0, "num_tokens": 3549040.0, "reward": 0.12262499332427979, "reward_std": 0.23698993027210236, "rewards/reward_func/mean": 0.12262499332427979, "rewards/reward_func/std": 0.3368479907512665, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.65166400000453, "epoch": 1.474609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.26254624128341675, "learning_rate": 1.0315789473684212e-06, "loss": 0.0, "num_tokens": 3554032.0, "reward": 0.23919999599456787, "reward_std": 0.47839999198913574, "rewards/reward_func/mean": 0.23919999599456787, "rewards/reward_func/std": 0.44306278228759766, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6490476988255978, "epoch": 1.4765625, "frac_reward_zero_std": 0.5, "grad_norm": 0.15658609569072723, "learning_rate": 1.0263157894736843e-06, "loss": 0.0, "num_tokens": 3558904.0, "reward": 0.878125011920929, "reward_std": 0.24375002086162567, "rewards/reward_func/mean": 0.878125011920929, "rewards/reward_func/std": 0.34471458196640015, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4935445673763752, "epoch": 1.478515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21735656261444092, "learning_rate": 1.0210526315789475e-06, "loss": 0.0, "num_tokens": 3563760.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6523591168224812, "epoch": 1.48046875, "frac_reward_zero_std": 0.5, "grad_norm": 0.26018184423446655, "learning_rate": 1.0157894736842105e-06, "loss": -0.0, "num_tokens": 3568288.0, "reward": 0.8710500001907349, "reward_std": 0.2310333549976349, "rewards/reward_func/mean": 0.8710500001907349, "rewards/reward_func/std": 0.3324243724346161, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8797458559274673, "epoch": 1.482421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2709335386753082, "learning_rate": 1.0105263157894738e-06, "loss": 0.0, "num_tokens": 3572824.0, "reward": 0.49787500500679016, "reward_std": 0.567833423614502, "rewards/reward_func/mean": 0.49787500500679016, "rewards/reward_func/std": 0.5257702469825745, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49080283008515835, "epoch": 1.484375, "frac_reward_zero_std": 0.5, "grad_norm": 0.16781467199325562, "learning_rate": 1.005263157894737e-06, "loss": -0.0, "num_tokens": 3577312.0, "reward": 0.9384000301361084, "reward_std": 0.01759999990463257, "rewards/reward_func/mean": 0.9384000301361084, "rewards/reward_func/std": 0.02489016018807888, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6200328879058361, "epoch": 1.486328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21412330865859985, "learning_rate": 1.0000000000000002e-06, "loss": -0.0, "num_tokens": 3581960.0, "reward": 0.49184998869895935, "reward_std": 0.5679755210876465, "rewards/reward_func/mean": 0.49184998869895935, "rewards/reward_func/std": 0.5259869694709778, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.32999411411583424, "epoch": 1.48828125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.947368421052631e-07, "loss": 0.0, "num_tokens": 3586456.0, "reward": 0.9296000003814697, "reward_std": 0.0, "rewards/reward_func/mean": 0.9296000003814697, "rewards/reward_func/std": 0.0, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5745944939553738, "epoch": 1.490234375, "frac_reward_zero_std": 0.5, "grad_norm": 0.14827704429626465, "learning_rate": 9.894736842105265e-07, "loss": -0.0, "num_tokens": 3591216.0, "reward": 0.3511999845504761, "reward_std": 0.23414616286754608, "rewards/reward_func/mean": 0.3511999845504761, "rewards/reward_func/std": 0.484712690114975, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7484982162714005, "epoch": 1.4921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2768227159976959, "learning_rate": 9.842105263157897e-07, "loss": 0.0, "num_tokens": 3595704.0, "reward": 0.8731499910354614, "reward_std": 0.2419903725385666, "rewards/reward_func/mean": 0.8731499910354614, "rewards/reward_func/std": 0.3327745497226715, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.558552972972393, "epoch": 1.494140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24600715935230255, "learning_rate": 9.789473684210526e-07, "loss": -0.0, "num_tokens": 3600560.0, "reward": 0.353300005197525, "reward_std": 0.5092028975486755, "rewards/reward_func/mean": 0.353300005197525, "rewards/reward_func/std": 0.4877893924713135, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.627804696559906, "epoch": 1.49609375, "frac_reward_zero_std": 0.5, "grad_norm": 0.34988823533058167, "learning_rate": 9.736842105263158e-07, "loss": 0.0, "num_tokens": 3605400.0, "reward": 0.11959999799728394, "reward_std": 0.23919998109340668, "rewards/reward_func/mean": 0.11959999799728394, "rewards/reward_func/std": 0.3382799029350281, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.40967404283583164, "epoch": 1.498046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.20395620167255402, "learning_rate": 9.68421052631579e-07, "loss": -0.0, "num_tokens": 3610384.0, "reward": 0.33976250886917114, "reward_std": 0.2288047820329666, "rewards/reward_func/mean": 0.33976250886917114, "rewards/reward_func/std": 0.4656626880168915, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.758793193846941, "epoch": 1.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.28868770599365234, "learning_rate": 9.631578947368423e-07, "loss": 0.0, "num_tokens": 3615248.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8679467141628265, "epoch": 1.501953125, "frac_reward_zero_std": 0.5, "grad_norm": 0.19479088485240936, "learning_rate": 9.578947368421053e-07, "loss": 0.0, "num_tokens": 3619800.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.665732316672802, "epoch": 1.50390625, "frac_reward_zero_std": 0.5, "grad_norm": 0.17135152220726013, "learning_rate": 9.526315789473685e-07, "loss": 0.0, "num_tokens": 3624288.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.654691674746573, "epoch": 1.505859375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1414864957332611, "learning_rate": 9.473684210526317e-07, "loss": 0.0, "num_tokens": 3629144.0, "reward": 0.6349375247955322, "reward_std": 0.22897498309612274, "rewards/reward_func/mean": 0.6349375247955322, "rewards/reward_func/std": 0.4740232825279236, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5642881374806166, "epoch": 1.5078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2399088591337204, "learning_rate": 9.421052631578948e-07, "loss": -0.0, "num_tokens": 3633928.0, "reward": 0.3761124610900879, "reward_std": 0.49379342794418335, "rewards/reward_func/mean": 0.3761124908924103, "rewards/reward_func/std": 0.47505778074264526, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8916216120123863, "epoch": 1.509765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.25461268424987793, "learning_rate": 9.368421052631579e-07, "loss": -0.0, "num_tokens": 3638416.0, "reward": 0.7593749761581421, "reward_std": 0.48124998807907104, "rewards/reward_func/mean": 0.7593749761581421, "rewards/reward_func/std": 0.44555091857910156, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3287378936074674, "epoch": 1.51171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.16437959671020508, "learning_rate": 9.315789473684212e-07, "loss": 0.0, "num_tokens": 3642912.0, "reward": 0.6930000185966492, "reward_std": 0.2701496481895447, "rewards/reward_func/mean": 0.6930000185966492, "rewards/reward_func/std": 0.4277917444705963, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5501508843153715, "epoch": 1.513671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19080518186092377, "learning_rate": 9.263157894736844e-07, "loss": -0.0, "num_tokens": 3647880.0, "reward": 0.3418999910354614, "reward_std": 0.21971121430397034, "rewards/reward_func/mean": 0.3418999910354614, "rewards/reward_func/std": 0.4448350965976715, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6878321021795273, "epoch": 1.515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24576818943023682, "learning_rate": 9.210526315789474e-07, "loss": -0.0, "num_tokens": 3652888.0, "reward": 0.35216251015663147, "reward_std": 0.5052483081817627, "rewards/reward_func/mean": 0.35216251015663147, "rewards/reward_func/std": 0.4830230474472046, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8899596408009529, "epoch": 1.517578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2556166350841522, "learning_rate": 9.157894736842106e-07, "loss": 0.0, "num_tokens": 3657672.0, "reward": 0.7323875427246094, "reward_std": 0.2783551812171936, "rewards/reward_func/mean": 0.7323875427246094, "rewards/reward_func/std": 0.42598211765289307, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6734718568623066, "epoch": 1.51953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22433032095432281, "learning_rate": 9.105263157894737e-07, "loss": 0.0, "num_tokens": 3662448.0, "reward": 0.4796375036239624, "reward_std": 0.5285993814468384, "rewards/reward_func/mean": 0.4796375036239624, "rewards/reward_func/std": 0.489534467458725, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.47480286099016666, "epoch": 1.521484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20607703924179077, "learning_rate": 9.05263157894737e-07, "loss": -0.0, "num_tokens": 3667216.0, "reward": 0.3725624680519104, "reward_std": 0.5063344240188599, "rewards/reward_func/mean": 0.3725624680519104, "rewards/reward_func/std": 0.48398149013519287, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5787655636668205, "epoch": 1.5234375, "frac_reward_zero_std": 0.5, "grad_norm": 0.19025181233882904, "learning_rate": 9.000000000000001e-07, "loss": 0.0, "num_tokens": 3672056.0, "reward": 0.11410000175237656, "reward_std": 0.2282000035047531, "rewards/reward_func/mean": 0.11410000175237656, "rewards/reward_func/std": 0.3227235674858093, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 1.0042685344815254, "epoch": 1.525390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2973874509334564, "learning_rate": 8.947368421052632e-07, "loss": 0.0, "num_tokens": 3676552.0, "reward": 0.874287486076355, "reward_std": 0.24427926540374756, "rewards/reward_func/mean": 0.874287486076355, "rewards/reward_func/std": 0.3382539749145508, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7004695013165474, "epoch": 1.52734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24851390719413757, "learning_rate": 8.894736842105264e-07, "loss": -0.0, "num_tokens": 3680912.0, "reward": 0.8754374980926514, "reward_std": 0.23534303903579712, "rewards/reward_func/mean": 0.8754374980926514, "rewards/reward_func/std": 0.3189397156238556, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6289892308413982, "epoch": 1.529296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.3434109687805176, "learning_rate": 8.842105263157895e-07, "loss": 0.0, "num_tokens": 3685568.0, "reward": 0.4922749996185303, "reward_std": 0.4815645217895508, "rewards/reward_func/mean": 0.4922749996185303, "rewards/reward_func/std": 0.519862949848175, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49262196384370327, "epoch": 1.53125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2120276689529419, "learning_rate": 8.789473684210527e-07, "loss": 0.0, "num_tokens": 3690344.0, "reward": 0.24681249260902405, "reward_std": 0.47292011976242065, "rewards/reward_func/mean": 0.24681249260902405, "rewards/reward_func/std": 0.4378432333469391, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6302190236747265, "epoch": 1.533203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22514617443084717, "learning_rate": 8.736842105263159e-07, "loss": -0.0, "num_tokens": 3695136.0, "reward": 0.24524998664855957, "reward_std": 0.4741288423538208, "rewards/reward_func/mean": 0.24524998664855957, "rewards/reward_func/std": 0.43902352452278137, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5339977368712425, "epoch": 1.53515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2869108319282532, "learning_rate": 8.68421052631579e-07, "loss": -0.0, "num_tokens": 3700064.0, "reward": 0.7380625009536743, "reward_std": 0.2860471308231354, "rewards/reward_func/mean": 0.7380625009536743, "rewards/reward_func/std": 0.45175832509994507, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7544026896357536, "epoch": 1.537109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.26479172706604004, "learning_rate": 8.631578947368421e-07, "loss": -0.0, "num_tokens": 3704800.0, "reward": 0.5, "reward_std": 0.5, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5345224738121033, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7368651237338781, "epoch": 1.5390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24209897220134735, "learning_rate": 8.578947368421054e-07, "loss": 0.0, "num_tokens": 3710008.0, "reward": 0.6328125, "reward_std": 0.529944896697998, "rewards/reward_func/mean": 0.6328125, "rewards/reward_func/std": 0.5071338415145874, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7057337984442711, "epoch": 1.541015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20642440021038055, "learning_rate": 8.526315789473685e-07, "loss": -0.0, "num_tokens": 3714688.0, "reward": 0.8713124990463257, "reward_std": 0.25737500190734863, "rewards/reward_func/mean": 0.8713124990463257, "rewards/reward_func/std": 0.3473237454891205, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6739199366420507, "epoch": 1.54296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2283017486333847, "learning_rate": 8.473684210526316e-07, "loss": 0.0, "num_tokens": 3719672.0, "reward": 0.3151249885559082, "reward_std": 0.4546101689338684, "rewards/reward_func/mean": 0.3151249885559082, "rewards/reward_func/std": 0.43122467398643494, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.37616512551903725, "epoch": 1.544921875, "frac_reward_zero_std": 0.5, "grad_norm": 0.13778182864189148, "learning_rate": 8.421052631578948e-07, "loss": -0.0, "num_tokens": 3724448.0, "reward": 0.49831247329711914, "reward_std": 0.023593232035636902, "rewards/reward_func/mean": 0.49831250309944153, "rewards/reward_func/std": 0.4902626574039459, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5549918822944164, "epoch": 1.546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22168178856372833, "learning_rate": 8.36842105263158e-07, "loss": 0.0, "num_tokens": 3729432.0, "reward": 0.12794999778270721, "reward_std": 0.22315937280654907, "rewards/reward_func/mean": 0.12794999778270721, "rewards/reward_func/std": 0.3018590807914734, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.870604507625103, "epoch": 1.548828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22514410316944122, "learning_rate": 8.315789473684212e-07, "loss": 0.0, "num_tokens": 3733952.0, "reward": 0.9918999671936035, "reward_std": 0.011635387316346169, "rewards/reward_func/mean": 0.9918999671936035, "rewards/reward_func/std": 0.011179066263139248, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.48082075640559196, "epoch": 1.55078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.22919267416000366, "learning_rate": 8.263157894736843e-07, "loss": 0.0, "num_tokens": 3738736.0, "reward": 0.3589249849319458, "reward_std": 0.5067580938339233, "rewards/reward_func/mean": 0.3589249849319458, "rewards/reward_func/std": 0.4885357916355133, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5452318023890257, "epoch": 1.552734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.19861352443695068, "learning_rate": 8.210526315789474e-07, "loss": -0.0, "num_tokens": 3743696.0, "reward": 0.12037499994039536, "reward_std": 0.22721248865127563, "rewards/reward_func/mean": 0.12037499248981476, "rewards/reward_func/std": 0.31041398644447327, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6286281645298004, "epoch": 1.5546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.20838730037212372, "learning_rate": 8.157894736842106e-07, "loss": 0.0, "num_tokens": 3748664.0, "reward": 0.32600000500679016, "reward_std": 0.46434351801872253, "rewards/reward_func/mean": 0.32600000500679016, "rewards/reward_func/std": 0.45025455951690674, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6037598215043545, "epoch": 1.556640625, "frac_reward_zero_std": 0.5, "grad_norm": 0.18488991260528564, "learning_rate": 8.105263157894736e-07, "loss": -0.0, "num_tokens": 3753024.0, "reward": 0.754687488079071, "reward_std": 0.2833658754825592, "rewards/reward_func/mean": 0.754687488079071, "rewards/reward_func/std": 0.45434102416038513, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7057324796915054, "epoch": 1.55859375, "frac_reward_zero_std": 0.5, "grad_norm": 0.18174698948860168, "learning_rate": 8.052631578947369e-07, "loss": 0.0, "num_tokens": 3757744.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5243270434439182, "epoch": 1.560546875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1930585652589798, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "num_tokens": 3762088.0, "reward": 0.8765624761581421, "reward_std": 0.24687500298023224, "rewards/reward_func/mean": 0.8765624761581421, "rewards/reward_func/std": 0.34913399815559387, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7073042988777161, "epoch": 1.5625, "frac_reward_zero_std": 0.5, "grad_norm": 0.15252089500427246, "learning_rate": 7.947368421052632e-07, "loss": 0.0, "num_tokens": 3766704.0, "reward": 0.8503124713897705, "reward_std": 0.23873114585876465, "rewards/reward_func/mean": 0.8503124713897705, "rewards/reward_func/std": 0.33856281638145447, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4393083956092596, "epoch": 1.564453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21687999367713928, "learning_rate": 7.894736842105263e-07, "loss": 0.0, "num_tokens": 3771488.0, "reward": 0.4840624928474426, "reward_std": 0.5265330076217651, "rewards/reward_func/mean": 0.4840624928474426, "rewards/reward_func/std": 0.48748067021369934, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7102243229746819, "epoch": 1.56640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2167195975780487, "learning_rate": 7.842105263157896e-07, "loss": 0.0, "num_tokens": 3776256.0, "reward": 0.7418375015258789, "reward_std": 0.4656052589416504, "rewards/reward_func/mean": 0.7418375015258789, "rewards/reward_func/std": 0.4318162798881531, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.520836379379034, "epoch": 1.568359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.23460784554481506, "learning_rate": 7.789473684210527e-07, "loss": -0.0, "num_tokens": 3781024.0, "reward": 0.3558874726295471, "reward_std": 0.5002304315567017, "rewards/reward_func/mean": 0.3558875024318695, "rewards/reward_func/std": 0.48085901141166687, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5246933065354824, "epoch": 1.5703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20828157663345337, "learning_rate": 7.736842105263158e-07, "loss": 0.0, "num_tokens": 3785672.0, "reward": 0.49729999899864197, "reward_std": 0.4982522130012512, "rewards/reward_func/mean": 0.49729999899864197, "rewards/reward_func/std": 0.531683087348938, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4413669481873512, "epoch": 1.572265625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.684210526315789e-07, "loss": 0.0, "num_tokens": 3790176.0, "reward": 0.9296000003814697, "reward_std": 0.0, "rewards/reward_func/mean": 0.9296000003814697, "rewards/reward_func/std": 0.0, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5796146206557751, "epoch": 1.57421875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.631578947368422e-07, "loss": 0.0, "num_tokens": 3794976.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.531171353533864, "epoch": 1.576171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22131916880607605, "learning_rate": 7.578947368421054e-07, "loss": -0.0, "num_tokens": 3799768.0, "reward": 0.37181249260902405, "reward_std": 0.5167224407196045, "rewards/reward_func/mean": 0.37181249260902405, "rewards/reward_func/std": 0.49653398990631104, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6098550744354725, "epoch": 1.578125, "frac_reward_zero_std": 0.5, "grad_norm": 0.1398448944091797, "learning_rate": 7.526315789473684e-07, "loss": 0.0, "num_tokens": 3804392.0, "reward": 0.7366249561309814, "reward_std": 0.2787158489227295, "rewards/reward_func/mean": 0.7366249561309814, "rewards/reward_func/std": 0.4469396471977234, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.715061454102397, "epoch": 1.580078125, "frac_reward_zero_std": 0.5, "grad_norm": 0.23018530011177063, "learning_rate": 7.473684210526316e-07, "loss": 0.0, "num_tokens": 3809256.0, "reward": 0.8765624761581421, "reward_std": 0.24687500298023224, "rewards/reward_func/mean": 0.8765624761581421, "rewards/reward_func/std": 0.34913399815559387, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6622244138270617, "epoch": 1.58203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.19872058928012848, "learning_rate": 7.421052631578949e-07, "loss": -0.0, "num_tokens": 3814208.0, "reward": 0.015625, "reward_std": 0.02392766997218132, "rewards/reward_func/mean": 0.015625, "rewards/reward_func/std": 0.0265165064483881, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.41381765250116587, "epoch": 1.583984375, "frac_reward_zero_std": 0.5, "grad_norm": 0.13804210722446442, "learning_rate": 7.368421052631579e-07, "loss": 0.0, "num_tokens": 3818696.0, "reward": 0.6972000002861023, "reward_std": 0.2683524191379547, "rewards/reward_func/mean": 0.6972000002861023, "rewards/reward_func/std": 0.43032118678092957, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7343226224184036, "epoch": 1.5859375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.315789473684211e-07, "loss": 0.0, "num_tokens": 3823568.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5337802059948444, "epoch": 1.587890625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1350008249282837, "learning_rate": 7.263157894736843e-07, "loss": -0.0, "num_tokens": 3828216.0, "reward": 0.9947500228881836, "reward_std": 0.010500003583729267, "rewards/reward_func/mean": 0.9947500228881836, "rewards/reward_func/std": 0.014849240891635418, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6469601839780807, "epoch": 1.58984375, "frac_reward_zero_std": 0.5, "grad_norm": 0.13170914351940155, "learning_rate": 7.210526315789475e-07, "loss": 0.0, "num_tokens": 3833048.0, "reward": 0.11959999799728394, "reward_std": 0.23919998109340668, "rewards/reward_func/mean": 0.11959999799728394, "rewards/reward_func/std": 0.3382799029350281, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6435871571302414, "epoch": 1.591796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21571120619773865, "learning_rate": 7.157894736842106e-07, "loss": -0.0, "num_tokens": 3837800.0, "reward": 0.7281500101089478, "reward_std": 0.4856353998184204, "rewards/reward_func/mean": 0.7281500101089478, "rewards/reward_func/std": 0.44976165890693665, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7475415430963039, "epoch": 1.59375, "frac_reward_zero_std": 0.0, "grad_norm": 0.22343456745147705, "learning_rate": 7.105263157894737e-07, "loss": -0.0, "num_tokens": 3842544.0, "reward": 0.7112499475479126, "reward_std": 0.45749998092651367, "rewards/reward_func/mean": 0.7112500071525574, "rewards/reward_func/std": 0.4237734377384186, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7341870330274105, "epoch": 1.595703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23006655275821686, "learning_rate": 7.052631578947369e-07, "loss": 0.0, "num_tokens": 3846872.0, "reward": 0.6531250476837158, "reward_std": 0.49706268310546875, "rewards/reward_func/mean": 0.653124988079071, "rewards/reward_func/std": 0.4788728654384613, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9006102904677391, "epoch": 1.59765625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1597568690776825, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "num_tokens": 3851368.0, "reward": 0.9973000288009644, "reward_std": 0.0054000020027160645, "rewards/reward_func/mean": 0.9973000288009644, "rewards/reward_func/std": 0.007636756170541048, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6276929546147585, "epoch": 1.599609375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.947368421052631e-07, "loss": 0.0, "num_tokens": 3855840.0, "reward": 0.9296000003814697, "reward_std": 0.0, "rewards/reward_func/mean": 0.9296000003814697, "rewards/reward_func/std": 0.0, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7189521789550781, "epoch": 1.6015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.25209999084472656, "learning_rate": 6.894736842105264e-07, "loss": 0.0, "num_tokens": 3860840.0, "reward": 0.5924999713897705, "reward_std": 0.5084478855133057, "rewards/reward_func/mean": 0.5924999713897705, "rewards/reward_func/std": 0.49154552817344666, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.49646480195224285, "epoch": 1.603515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19341827929019928, "learning_rate": 6.842105263157896e-07, "loss": -0.0, "num_tokens": 3865624.0, "reward": 0.36006247997283936, "reward_std": 0.513949990272522, "rewards/reward_func/mean": 0.36006247997283936, "rewards/reward_func/std": 0.493501216173172, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5097112730145454, "epoch": 1.60546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.20900575816631317, "learning_rate": 6.789473684210526e-07, "loss": 0.0, "num_tokens": 3870608.0, "reward": 0.2334499955177307, "reward_std": 0.2624872624874115, "rewards/reward_func/mean": 0.2334500104188919, "rewards/reward_func/std": 0.4170190095901489, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4433702193200588, "epoch": 1.607421875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.736842105263158e-07, "loss": 0.0, "num_tokens": 3875120.0, "reward": 0.9296000003814697, "reward_std": 0.0, "rewards/reward_func/mean": 0.9296000003814697, "rewards/reward_func/std": 0.0, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7894489616155624, "epoch": 1.609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21178889274597168, "learning_rate": 6.684210526315791e-07, "loss": 0.0, "num_tokens": 3880120.0, "reward": 0.34780001640319824, "reward_std": 0.5050222873687744, "rewards/reward_func/mean": 0.34780001640319824, "rewards/reward_func/std": 0.4806229770183563, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.39088401943445206, "epoch": 1.611328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24352553486824036, "learning_rate": 6.631578947368422e-07, "loss": 0.0, "num_tokens": 3885080.0, "reward": 0.3395000100135803, "reward_std": 0.49134349822998047, "rewards/reward_func/mean": 0.3395000100135803, "rewards/reward_func/std": 0.4688292443752289, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 1.138112135231495, "epoch": 1.61328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2556565999984741, "learning_rate": 6.578947368421053e-07, "loss": 0.0, "num_tokens": 3889560.0, "reward": 0.5066750049591064, "reward_std": 0.5636498928070068, "rewards/reward_func/mean": 0.5066750049591064, "rewards/reward_func/std": 0.5218542218208313, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7840673625469208, "epoch": 1.615234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21557015180587769, "learning_rate": 6.526315789473684e-07, "loss": 0.0, "num_tokens": 3894040.0, "reward": 0.7535499930381775, "reward_std": 0.4857522249221802, "rewards/reward_func/mean": 0.7535499930381775, "rewards/reward_func/std": 0.4499310553073883, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5850768871605396, "epoch": 1.6171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.24189525842666626, "learning_rate": 6.473684210526317e-07, "loss": -0.0, "num_tokens": 3898688.0, "reward": 0.6219249963760376, "reward_std": 0.526694655418396, "rewards/reward_func/mean": 0.6219249963760376, "rewards/reward_func/std": 0.508267343044281, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6123648304492235, "epoch": 1.619140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2563243508338928, "learning_rate": 6.421052631578948e-07, "loss": 0.0, "num_tokens": 3903656.0, "reward": 0.125062495470047, "reward_std": 0.22389693558216095, "rewards/reward_func/mean": 0.1250625103712082, "rewards/reward_func/std": 0.3084697723388672, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8369600027799606, "epoch": 1.62109375, "frac_reward_zero_std": 0.5, "grad_norm": 0.17828819155693054, "learning_rate": 6.368421052631579e-07, "loss": 0.0, "num_tokens": 3908192.0, "reward": 0.7562500238418579, "reward_std": 0.2814582586288452, "rewards/reward_func/mean": 0.7562500238418579, "rewards/reward_func/std": 0.4513373076915741, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.638560563325882, "epoch": 1.623046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19805623590946198, "learning_rate": 6.315789473684211e-07, "loss": 0.0, "num_tokens": 3912848.0, "reward": 0.5066750049591064, "reward_std": 0.5637478828430176, "rewards/reward_func/mean": 0.5066750049591064, "rewards/reward_func/std": 0.5220252871513367, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5707159228622913, "epoch": 1.625, "frac_reward_zero_std": 0.0, "grad_norm": 0.18247583508491516, "learning_rate": 6.263157894736844e-07, "loss": -0.0, "num_tokens": 3917504.0, "reward": 0.3765625059604645, "reward_std": 0.5376508831977844, "rewards/reward_func/mean": 0.3765625059604645, "rewards/reward_func/std": 0.5162726044654846, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7538819462060928, "epoch": 1.626953125, "frac_reward_zero_std": 0.5, "grad_norm": 0.17551633715629578, "learning_rate": 6.210526315789474e-07, "loss": 0.0, "num_tokens": 3922512.0, "reward": 0.12229999899864197, "reward_std": 0.24459999799728394, "rewards/reward_func/mean": 0.12229999899864197, "rewards/reward_func/std": 0.34591662883758545, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4829423278570175, "epoch": 1.62890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20213103294372559, "learning_rate": 6.157894736842106e-07, "loss": 0.0, "num_tokens": 3927296.0, "reward": 0.3590875267982483, "reward_std": 0.2541199028491974, "rewards/reward_func/mean": 0.3590874969959259, "rewards/reward_func/std": 0.4724847376346588, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5105188228189945, "epoch": 1.630859375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1484820693731308, "learning_rate": 6.105263157894738e-07, "loss": -0.0, "num_tokens": 3932224.0, "reward": 0.7462999820709229, "reward_std": 0.26800599694252014, "rewards/reward_func/mean": 0.7462999820709229, "rewards/reward_func/std": 0.4297657012939453, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6433870941400528, "epoch": 1.6328125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24502220749855042, "learning_rate": 6.052631578947369e-07, "loss": 0.0, "num_tokens": 3936848.0, "reward": 0.5093749761581421, "reward_std": 0.5669463872909546, "rewards/reward_func/mean": 0.5093749761581421, "rewards/reward_func/std": 0.5250744223594666, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4820468481630087, "epoch": 1.634765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.1734929382801056, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "num_tokens": 3941352.0, "reward": 0.8113000392913818, "reward_std": 0.23659999668598175, "rewards/reward_func/mean": 0.8113000392913818, "rewards/reward_func/std": 0.32786741852760315, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5926277190446854, "epoch": 1.63671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2382938116788864, "learning_rate": 5.947368421052632e-07, "loss": -0.0, "num_tokens": 3946144.0, "reward": 0.7357000112533569, "reward_std": 0.2871914803981781, "rewards/reward_func/mean": 0.7357000112533569, "rewards/reward_func/std": 0.45415768027305603, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7808935679495335, "epoch": 1.638671875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1889333575963974, "learning_rate": 5.894736842105263e-07, "loss": 0.0, "num_tokens": 3950680.0, "reward": 0.7504249811172485, "reward_std": 0.28198346495628357, "rewards/reward_func/mean": 0.7504249811172485, "rewards/reward_func/std": 0.4555181860923767, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8180569782853127, "epoch": 1.640625, "frac_reward_zero_std": 0.5, "grad_norm": 0.17319242656230927, "learning_rate": 5.842105263157896e-07, "loss": 0.0, "num_tokens": 3955160.0, "reward": 0.7562500238418579, "reward_std": 0.2814582586288452, "rewards/reward_func/mean": 0.7562500238418579, "rewards/reward_func/std": 0.4513373076915741, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4921247735619545, "epoch": 1.642578125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.789473684210526e-07, "loss": 0.0, "num_tokens": 3959648.0, "reward": 0.9296000003814697, "reward_std": 0.0, "rewards/reward_func/mean": 0.9296000003814697, "rewards/reward_func/std": 0.0, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6009175051003695, "epoch": 1.64453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21017959713935852, "learning_rate": 5.736842105263158e-07, "loss": -0.0, "num_tokens": 3964288.0, "reward": 0.49517500400543213, "reward_std": 0.4928368628025055, "rewards/reward_func/mean": 0.49517500400543213, "rewards/reward_func/std": 0.5229134559631348, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6235355269163847, "epoch": 1.646484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2219318151473999, "learning_rate": 5.68421052631579e-07, "loss": 0.0, "num_tokens": 3969248.0, "reward": 0.3330000042915344, "reward_std": 0.47834351658821106, "rewards/reward_func/mean": 0.3330000042915344, "rewards/reward_func/std": 0.45958366990089417, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.44258963875472546, "epoch": 1.6484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21175144612789154, "learning_rate": 5.631578947368421e-07, "loss": -0.0, "num_tokens": 3974224.0, "reward": 0.12037499994039536, "reward_std": 0.2241630107164383, "rewards/reward_func/mean": 0.12037499994039536, "rewards/reward_func/std": 0.31041398644447327, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.691203698515892, "epoch": 1.650390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2612920105457306, "learning_rate": 5.578947368421053e-07, "loss": 0.0, "num_tokens": 3978880.0, "reward": 0.6223000288009644, "reward_std": 0.536927342414856, "rewards/reward_func/mean": 0.6223000288009644, "rewards/reward_func/std": 0.5153651237487793, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5455706492066383, "epoch": 1.65234375, "frac_reward_zero_std": 0.0, "grad_norm": 0.21940505504608154, "learning_rate": 5.526315789473684e-07, "loss": 0.0, "num_tokens": 3983728.0, "reward": 0.49701249599456787, "reward_std": 0.5561558604240417, "rewards/reward_func/mean": 0.49701249599456787, "rewards/reward_func/std": 0.5150313377380371, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5260077305138111, "epoch": 1.654296875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.473684210526316e-07, "loss": 0.0, "num_tokens": 3988360.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8520932123064995, "epoch": 1.65625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2201906442642212, "learning_rate": 5.421052631578948e-07, "loss": 0.0, "num_tokens": 3993352.0, "reward": 0.2501250207424164, "reward_std": 0.2676456868648529, "rewards/reward_func/mean": 0.2501250207424164, "rewards/reward_func/std": 0.39523282647132874, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5580763444304466, "epoch": 1.658203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.20035962760448456, "learning_rate": 5.368421052631579e-07, "loss": -0.0, "num_tokens": 3998008.0, "reward": 0.3670499920845032, "reward_std": 0.5273429155349731, "rewards/reward_func/mean": 0.3670499920845032, "rewards/reward_func/std": 0.5067015290260315, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.46416748873889446, "epoch": 1.66015625, "frac_reward_zero_std": 0.5, "grad_norm": 0.15010768175125122, "learning_rate": 5.315789473684211e-07, "loss": -0.0, "num_tokens": 4002520.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6008065789937973, "epoch": 1.662109375, "frac_reward_zero_std": 0.5, "grad_norm": 0.18306009471416473, "learning_rate": 5.263157894736843e-07, "loss": -0.0, "num_tokens": 4007264.0, "reward": 0.8224999904632568, "reward_std": 0.23499999940395355, "rewards/reward_func/mean": 0.8224999904632568, "rewards/reward_func/std": 0.33234018087387085, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.45169725827872753, "epoch": 1.6640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.20183001458644867, "learning_rate": 5.210526315789474e-07, "loss": 0.0, "num_tokens": 4012224.0, "reward": 0.452875018119812, "reward_std": 0.5022744536399841, "rewards/reward_func/mean": 0.452875018119812, "rewards/reward_func/std": 0.46521246433258057, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5721710417419672, "epoch": 1.666015625, "frac_reward_zero_std": 0.0, "grad_norm": 0.21509496867656708, "learning_rate": 5.157894736842106e-07, "loss": -0.0, "num_tokens": 4017208.0, "reward": 0.3440000116825104, "reward_std": 0.4370303750038147, "rewards/reward_func/mean": 0.3440000116825104, "rewards/reward_func/std": 0.42025572061538696, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7655346915125847, "epoch": 1.66796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.27250996232032776, "learning_rate": 5.105263157894738e-07, "loss": -0.0, "num_tokens": 4021728.0, "reward": 0.6143499612808228, "reward_std": 0.25468018651008606, "rewards/reward_func/mean": 0.6143499612808228, "rewards/reward_func/std": 0.5089049935340881, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8787706345319748, "epoch": 1.669921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.31136733293533325, "learning_rate": 5.052631578947369e-07, "loss": 0.0, "num_tokens": 4026216.0, "reward": 0.8695999979972839, "reward_std": 0.2536522150039673, "rewards/reward_func/mean": 0.8695999979972839, "rewards/reward_func/std": 0.35150691866874695, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5140975881367922, "epoch": 1.671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2465497851371765, "learning_rate": 5.000000000000001e-07, "loss": -0.0, "num_tokens": 4031008.0, "reward": 0.4752500057220459, "reward_std": 0.5488084554672241, "rewards/reward_func/mean": 0.4752500057220459, "rewards/reward_func/std": 0.5081146359443665, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6999778598546982, "epoch": 1.673828125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.947368421052632e-07, "loss": 0.0, "num_tokens": 4036000.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4997900687158108, "epoch": 1.67578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21455521881580353, "learning_rate": 4.894736842105263e-07, "loss": -0.0, "num_tokens": 4040968.0, "reward": 0.23787499964237213, "reward_std": 0.4509041905403137, "rewards/reward_func/mean": 0.23787501454353333, "rewards/reward_func/std": 0.41770508885383606, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5979567542672157, "epoch": 1.677734375, "frac_reward_zero_std": 0.5, "grad_norm": 0.17116230726242065, "learning_rate": 4.842105263157895e-07, "loss": 0.0, "num_tokens": 4045632.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8094654753804207, "epoch": 1.6796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2224608212709427, "learning_rate": 4.789473684210526e-07, "loss": -0.0, "num_tokens": 4050216.0, "reward": 0.7447500228881836, "reward_std": 0.2991751432418823, "rewards/reward_func/mean": 0.7447500228881836, "rewards/reward_func/std": 0.4598980247974396, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6717873178422451, "epoch": 1.681640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24959073960781097, "learning_rate": 4.7368421052631585e-07, "loss": 0.0, "num_tokens": 4054856.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5175492167472839, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6693816427141428, "epoch": 1.68359375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1378486603498459, "learning_rate": 4.6842105263157896e-07, "loss": 0.0, "num_tokens": 4059712.0, "reward": 0.8765624761581421, "reward_std": 0.24687500298023224, "rewards/reward_func/mean": 0.8765624761581421, "rewards/reward_func/std": 0.34913399815559387, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4828580655157566, "epoch": 1.685546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19800472259521484, "learning_rate": 4.631578947368422e-07, "loss": 0.0, "num_tokens": 4064496.0, "reward": 0.13242501020431519, "reward_std": 0.24046964943408966, "rewards/reward_func/mean": 0.132424995303154, "rewards/reward_func/std": 0.32472512125968933, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7691012695431709, "epoch": 1.6875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22753512859344482, "learning_rate": 4.578947368421053e-07, "loss": 0.0, "num_tokens": 4069032.0, "reward": 0.6379250288009644, "reward_std": 0.5109583139419556, "rewards/reward_func/mean": 0.6379250288009644, "rewards/reward_func/std": 0.4939082860946655, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6634176559746265, "epoch": 1.689453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24916769564151764, "learning_rate": 4.526315789473685e-07, "loss": -0.0, "num_tokens": 4074024.0, "reward": 0.23919999599456787, "reward_std": 0.47839999198913574, "rewards/reward_func/mean": 0.23919999599456787, "rewards/reward_func/std": 0.44306278228759766, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6947410106658936, "epoch": 1.69140625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2586158812046051, "learning_rate": 4.473684210526316e-07, "loss": 0.0, "num_tokens": 4078528.0, "reward": 0.7488625049591064, "reward_std": 0.29228225350379944, "rewards/reward_func/mean": 0.7488625049591064, "rewards/reward_func/std": 0.4584231972694397, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.38118401169776917, "epoch": 1.693359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.20857468247413635, "learning_rate": 4.421052631578947e-07, "loss": 0.0, "num_tokens": 4083312.0, "reward": 0.2487500011920929, "reward_std": 0.46485579013824463, "rewards/reward_func/mean": 0.2487500011920929, "rewards/reward_func/std": 0.4304842948913574, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6276389136910439, "epoch": 1.6953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23110730946063995, "learning_rate": 4.3684210526315794e-07, "loss": 0.0, "num_tokens": 4087952.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5682788696140051, "epoch": 1.697265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.230777308344841, "learning_rate": 4.3157894736842105e-07, "loss": -0.0, "num_tokens": 4092624.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5345224738121033, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5546636991202831, "epoch": 1.69921875, "frac_reward_zero_std": 0.5, "grad_norm": 0.15263542532920837, "learning_rate": 4.2631578947368427e-07, "loss": 0.0, "num_tokens": 4096984.0, "reward": 0.8843749761581421, "reward_std": 0.23125000298023224, "rewards/reward_func/mean": 0.8843749761581421, "rewards/reward_func/std": 0.32703688740730286, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.375160763040185, "epoch": 1.701171875, "frac_reward_zero_std": 0.0, "grad_norm": 0.18006189167499542, "learning_rate": 4.210526315789474e-07, "loss": 0.0, "num_tokens": 4101480.0, "reward": 0.46480000019073486, "reward_std": 0.46480000019073486, "rewards/reward_func/mean": 0.46480000019073486, "rewards/reward_func/std": 0.49689212441444397, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.780842486768961, "epoch": 1.703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2666405439376831, "learning_rate": 4.157894736842106e-07, "loss": -0.0, "num_tokens": 4106272.0, "reward": 0.7378374934196472, "reward_std": 0.4632420837879181, "rewards/reward_func/mean": 0.7378374934196472, "rewards/reward_func/std": 0.429521769285202, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.480727631598711, "epoch": 1.705078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2148435115814209, "learning_rate": 4.105263157894737e-07, "loss": 0.0, "num_tokens": 4111048.0, "reward": 0.2475000023841858, "reward_std": 0.2923952341079712, "rewards/reward_func/mean": 0.2475000023841858, "rewards/reward_func/std": 0.42844611406326294, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4629284869879484, "epoch": 1.70703125, "frac_reward_zero_std": 0.5, "grad_norm": 0.14672014117240906, "learning_rate": 4.052631578947368e-07, "loss": 0.0, "num_tokens": 4115824.0, "reward": 0.24056249856948853, "reward_std": 0.274181067943573, "rewards/reward_func/mean": 0.24056249856948853, "rewards/reward_func/std": 0.44159868359565735, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5807988476008177, "epoch": 1.708984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.19631755352020264, "learning_rate": 4.0000000000000003e-07, "loss": -0.0, "num_tokens": 4120472.0, "reward": 0.25, "reward_std": 0.5, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.4629100561141968, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5514065846800804, "epoch": 1.7109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2597048282623291, "learning_rate": 3.9473684210526315e-07, "loss": 0.0, "num_tokens": 4124832.0, "reward": 0.768750011920929, "reward_std": 0.4625000059604645, "rewards/reward_func/mean": 0.768750011920929, "rewards/reward_func/std": 0.42819181084632874, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6845806501805782, "epoch": 1.712890625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1643269956111908, "learning_rate": 3.8947368421052636e-07, "loss": 0.0, "num_tokens": 4129696.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.433909023180604, "epoch": 1.71484375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18823891878128052, "learning_rate": 3.8421052631578947e-07, "loss": 0.0, "num_tokens": 4134664.0, "reward": 0.11256249994039536, "reward_std": 0.2251249998807907, "rewards/reward_func/mean": 0.11256249994039536, "rewards/reward_func/std": 0.31335461139678955, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5910208597779274, "epoch": 1.716796875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22284384071826935, "learning_rate": 3.789473684210527e-07, "loss": 0.0, "num_tokens": 4139456.0, "reward": 0.6029999852180481, "reward_std": 0.5188616514205933, "rewards/reward_func/mean": 0.6029999852180481, "rewards/reward_func/std": 0.4995529353618622, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8369920663535595, "epoch": 1.71875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2716849446296692, "learning_rate": 3.736842105263158e-07, "loss": 0.0, "num_tokens": 4144216.0, "reward": 0.6250250339508057, "reward_std": 0.5000154972076416, "rewards/reward_func/mean": 0.6250250339508057, "rewards/reward_func/std": 0.4835914373397827, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5901597924530506, "epoch": 1.720703125, "frac_reward_zero_std": 0.5, "grad_norm": 0.15631148219108582, "learning_rate": 3.6842105263157896e-07, "loss": -0.0, "num_tokens": 4148832.0, "reward": 0.8573124408721924, "reward_std": 0.24137499928474426, "rewards/reward_func/mean": 0.8573124408721924, "rewards/reward_func/std": 0.3413558006286621, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.618029011413455, "epoch": 1.72265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2383870780467987, "learning_rate": 3.6315789473684213e-07, "loss": 0.0, "num_tokens": 4153624.0, "reward": 0.5831999778747559, "reward_std": 0.5035399198532104, "rewards/reward_func/mean": 0.5831999778747559, "rewards/reward_func/std": 0.4830016791820526, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.741577934473753, "epoch": 1.724609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.26154232025146484, "learning_rate": 3.578947368421053e-07, "loss": 0.0, "num_tokens": 4158272.0, "reward": 0.5, "reward_std": 0.5, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5345224738121033, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6781878937035799, "epoch": 1.7265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2297692745923996, "learning_rate": 3.5263157894736846e-07, "loss": -0.0, "num_tokens": 4163288.0, "reward": 0.45639997720718384, "reward_std": 0.5271494388580322, "rewards/reward_func/mean": 0.45639997720718384, "rewards/reward_func/std": 0.48832178115844727, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.770763710141182, "epoch": 1.728515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24367251992225647, "learning_rate": 3.4736842105263157e-07, "loss": -0.0, "num_tokens": 4168048.0, "reward": 0.49502497911453247, "reward_std": 0.5504268407821655, "rewards/reward_func/mean": 0.49502500891685486, "rewards/reward_func/std": 0.5097962021827698, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.757527232170105, "epoch": 1.73046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.21755702793598175, "learning_rate": 3.421052631578948e-07, "loss": -0.0, "num_tokens": 4172808.0, "reward": 0.6312999725341797, "reward_std": 0.24971508979797363, "rewards/reward_func/mean": 0.6312999725341797, "rewards/reward_func/std": 0.4959184527397156, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5246509797871113, "epoch": 1.732421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2858599126338959, "learning_rate": 3.368421052631579e-07, "loss": -0.0, "num_tokens": 4177456.0, "reward": 0.4972499907016754, "reward_std": 0.498220831155777, "rewards/reward_func/mean": 0.4972499907016754, "rewards/reward_func/std": 0.5316314101219177, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.788298238068819, "epoch": 1.734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.24566468596458435, "learning_rate": 3.315789473684211e-07, "loss": 0.0, "num_tokens": 4182456.0, "reward": 0.3424000144004822, "reward_std": 0.4942222833633423, "rewards/reward_func/mean": 0.3424000144004822, "rewards/reward_func/std": 0.472703218460083, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.625529520213604, "epoch": 1.736328125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 3.263157894736842e-07, "loss": 0.0, "num_tokens": 4187440.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5419940818101168, "epoch": 1.73828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24041259288787842, "learning_rate": 3.210526315789474e-07, "loss": -0.0, "num_tokens": 4192360.0, "reward": 0.5180374979972839, "reward_std": 0.4634375274181366, "rewards/reward_func/mean": 0.5180374979972839, "rewards/reward_func/std": 0.5047063231468201, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7481512352824211, "epoch": 1.740234375, "frac_reward_zero_std": 0.5, "grad_norm": 0.16213132441043854, "learning_rate": 3.1578947368421055e-07, "loss": 0.0, "num_tokens": 4196952.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8599461615085602, "epoch": 1.7421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2375580370426178, "learning_rate": 3.105263157894737e-07, "loss": -0.0, "num_tokens": 4201568.0, "reward": 0.8697500228881836, "reward_std": 0.2605000138282776, "rewards/reward_func/mean": 0.8697500228881836, "rewards/reward_func/std": 0.3517392575740814, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8133350238204002, "epoch": 1.744140625, "frac_reward_zero_std": 0.5, "grad_norm": 0.21218901872634888, "learning_rate": 3.052631578947369e-07, "loss": 0.0, "num_tokens": 4206320.0, "reward": 0.8271875381469727, "reward_std": 0.22562499344348907, "rewards/reward_func/mean": 0.8271875381469727, "rewards/reward_func/std": 0.3190819323062897, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.44859400019049644, "epoch": 1.74609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.18427495658397675, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "num_tokens": 4210984.0, "reward": 0.38631248474121094, "reward_std": 0.23885026574134827, "rewards/reward_func/mean": 0.38631248474121094, "rewards/reward_func/std": 0.5027945637702942, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6991079337894917, "epoch": 1.748046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2558084726333618, "learning_rate": 2.9473684210526315e-07, "loss": 0.0, "num_tokens": 4215472.0, "reward": 0.628125011920929, "reward_std": 0.5337572693824768, "rewards/reward_func/mean": 0.628125011920929, "rewards/reward_func/std": 0.5132507681846619, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8943945094943047, "epoch": 1.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.24125763773918152, "learning_rate": 2.894736842105263e-07, "loss": 0.0, "num_tokens": 4219952.0, "reward": 0.5051125288009644, "reward_std": 0.5653034448623657, "rewards/reward_func/mean": 0.5051125288009644, "rewards/reward_func/std": 0.5234332084655762, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6643506474792957, "epoch": 1.751953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.19916725158691406, "learning_rate": 2.842105263157895e-07, "loss": 0.0, "num_tokens": 4224704.0, "reward": 0.8607000112533569, "reward_std": 0.2525746822357178, "rewards/reward_func/mean": 0.8607000112533569, "rewards/reward_func/std": 0.34793561697006226, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6863343082368374, "epoch": 1.75390625, "frac_reward_zero_std": 0.5, "grad_norm": 0.18417027592658997, "learning_rate": 2.7894736842105264e-07, "loss": -0.0, "num_tokens": 4229464.0, "reward": 0.7471874952316284, "reward_std": 0.2749817669391632, "rewards/reward_func/mean": 0.7471874952316284, "rewards/reward_func/std": 0.45018935203552246, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.703750491142273, "epoch": 1.755859375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2679993808269501, "learning_rate": 2.736842105263158e-07, "loss": 0.0, "num_tokens": 4234464.0, "reward": 0.33969998359680176, "reward_std": 0.4910672605037689, "rewards/reward_func/mean": 0.33969998359680176, "rewards/reward_func/std": 0.46902716159820557, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8419626951217651, "epoch": 1.7578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.28019896149635315, "learning_rate": 2.6842105263157897e-07, "loss": 0.0, "num_tokens": 4238976.0, "reward": 0.7445999979972839, "reward_std": 0.49645256996154785, "rewards/reward_func/mean": 0.7445999979972839, "rewards/reward_func/std": 0.45967379212379456, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8848395217210054, "epoch": 1.759765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2501536011695862, "learning_rate": 2.6315789473684213e-07, "loss": 0.0, "num_tokens": 4243728.0, "reward": 0.73808753490448, "reward_std": 0.4636053442955017, "rewards/reward_func/mean": 0.7380874752998352, "rewards/reward_func/std": 0.4300731420516968, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5995431952178478, "epoch": 1.76171875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1700550615787506, "learning_rate": 2.578947368421053e-07, "loss": -0.0, "num_tokens": 4248720.0, "reward": 0.2184000015258789, "reward_std": 0.2522551119327545, "rewards/reward_func/mean": 0.2184000015258789, "rewards/reward_func/std": 0.4044714868068695, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8698443882167339, "epoch": 1.763671875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2777104675769806, "learning_rate": 2.5263157894736846e-07, "loss": -0.0, "num_tokens": 4253256.0, "reward": 0.504687488079071, "reward_std": 0.5720410346984863, "rewards/reward_func/mean": 0.504687488079071, "rewards/reward_func/std": 0.5296536087989807, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7046304792165756, "epoch": 1.765625, "frac_reward_zero_std": 0.0, "grad_norm": 0.23841030895709991, "learning_rate": 2.473684210526316e-07, "loss": 0.0, "num_tokens": 4258024.0, "reward": 0.6187999844551086, "reward_std": 0.5329432487487793, "rewards/reward_func/mean": 0.6187999844551086, "rewards/reward_func/std": 0.5125207901000977, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7159217670559883, "epoch": 1.767578125, "frac_reward_zero_std": 0.0, "grad_norm": 0.2175939530134201, "learning_rate": 2.4210526315789473e-07, "loss": 0.0, "num_tokens": 4263024.0, "reward": 0.3587000072002411, "reward_std": 0.5176094770431519, "rewards/reward_func/mean": 0.3587000072002411, "rewards/reward_func/std": 0.4951927065849304, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4197417329996824, "epoch": 1.76953125, "frac_reward_zero_std": 0.0, "grad_norm": 0.18832392990589142, "learning_rate": 2.3684210526315792e-07, "loss": 0.0, "num_tokens": 4267968.0, "reward": 0.012500000186264515, "reward_std": 0.014433757402002811, "rewards/reward_func/mean": 0.012500000186264515, "rewards/reward_func/std": 0.013363063335418701, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7613006345927715, "epoch": 1.771484375, "frac_reward_zero_std": 0.5, "grad_norm": 0.17875666916370392, "learning_rate": 2.315789473684211e-07, "loss": -0.0, "num_tokens": 4272328.0, "reward": 0.9947500228881836, "reward_std": 0.010500003583729267, "rewards/reward_func/mean": 0.9947500228881836, "rewards/reward_func/std": 0.014849240891635418, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7562070265412331, "epoch": 1.7734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2620117664337158, "learning_rate": 2.2631578947368425e-07, "loss": 0.0, "num_tokens": 4276768.0, "reward": 0.6265624761581421, "reward_std": 0.5368822813034058, "rewards/reward_func/mean": 0.6265624761581421, "rewards/reward_func/std": 0.515407145023346, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.735223077237606, "epoch": 1.775390625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24371092021465302, "learning_rate": 2.2105263157894736e-07, "loss": -0.0, "num_tokens": 4281480.0, "reward": 0.9846999645233154, "reward_std": 0.03060000017285347, "rewards/reward_func/mean": 0.9846999645233154, "rewards/reward_func/std": 0.030131708830595016, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.888181284070015, "epoch": 1.77734375, "frac_reward_zero_std": 0.0, "grad_norm": 0.2600753605365753, "learning_rate": 2.1578947368421053e-07, "loss": 0.0, "num_tokens": 4285840.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5345224738121033, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6173073314130306, "epoch": 1.779296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.22857871651649475, "learning_rate": 2.105263157894737e-07, "loss": -0.0, "num_tokens": 4290496.0, "reward": 0.37229999899864197, "reward_std": 0.5355914831161499, "rewards/reward_func/mean": 0.37229999899864197, "rewards/reward_func/std": 0.5138660669326782, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5189631078392267, "epoch": 1.78125, "frac_reward_zero_std": 0.5, "grad_norm": 0.14599941670894623, "learning_rate": 2.0526315789473685e-07, "loss": -0.0, "num_tokens": 4295008.0, "reward": 0.8134000301361084, "reward_std": 0.23240000009536743, "rewards/reward_func/mean": 0.8134000301361084, "rewards/reward_func/std": 0.3286632299423218, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8033823296427727, "epoch": 1.783203125, "frac_reward_zero_std": 0.5, "grad_norm": 0.18053551018238068, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "num_tokens": 4299768.0, "reward": 0.8362500071525574, "reward_std": 0.22793914377689362, "rewards/reward_func/mean": 0.8362500071525574, "rewards/reward_func/std": 0.3183859884738922, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6970736794173717, "epoch": 1.78515625, "frac_reward_zero_std": 0.5, "grad_norm": 0.16842643916606903, "learning_rate": 1.9473684210526318e-07, "loss": -0.0, "num_tokens": 4304520.0, "reward": 0.8224999904632568, "reward_std": 0.23499999940395355, "rewards/reward_func/mean": 0.8224999904632568, "rewards/reward_func/std": 0.33234018087387085, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6607658788561821, "epoch": 1.787109375, "frac_reward_zero_std": 0.0, "grad_norm": 0.22088974714279175, "learning_rate": 1.8947368421052634e-07, "loss": 0.0, "num_tokens": 4309296.0, "reward": 0.7371000051498413, "reward_std": 0.28240811824798584, "rewards/reward_func/mean": 0.7371000051498413, "rewards/reward_func/std": 0.4553808271884918, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6025678031146526, "epoch": 1.7890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2665260136127472, "learning_rate": 1.8421052631578948e-07, "loss": 0.0, "num_tokens": 4314088.0, "reward": 0.8602499961853027, "reward_std": 0.24355539679527283, "rewards/reward_func/mean": 0.8602499961853027, "rewards/reward_func/std": 0.32780489325523376, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.421773798763752, "epoch": 1.791015625, "frac_reward_zero_std": 0.5, "grad_norm": 0.12458179891109467, "learning_rate": 1.7894736842105265e-07, "loss": 0.0, "num_tokens": 4319040.0, "reward": 0.0031250000465661287, "reward_std": 0.0062500000931322575, "rewards/reward_func/mean": 0.0031250000465661287, "rewards/reward_func/std": 0.008838835172355175, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8600736558437347, "epoch": 1.79296875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2437726855278015, "learning_rate": 1.7368421052631578e-07, "loss": -0.0, "num_tokens": 4323544.0, "reward": 0.5066750049591064, "reward_std": 0.4920022189617157, "rewards/reward_func/mean": 0.5066750049591064, "rewards/reward_func/std": 0.5217258930206299, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7552273385226727, "epoch": 1.794921875, "frac_reward_zero_std": 0.0, "grad_norm": 0.23656296730041504, "learning_rate": 1.6842105263157895e-07, "loss": -0.0, "num_tokens": 4328408.0, "reward": 0.8660625219345093, "reward_std": 0.2540762424468994, "rewards/reward_func/mean": 0.8660625219345093, "rewards/reward_func/std": 0.34541285037994385, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6802506037056446, "epoch": 1.796875, "frac_reward_zero_std": 0.5, "grad_norm": 0.1381627321243286, "learning_rate": 1.631578947368421e-07, "loss": 0.0, "num_tokens": 4333080.0, "reward": 0.875, "reward_std": 0.25, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3535533845424652, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.19835966220125556, "epoch": 1.798828125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1.5789473684210527e-07, "loss": 0.0, "num_tokens": 4337592.0, "reward": 0.9296000003814697, "reward_std": 0.0, "rewards/reward_func/mean": 0.9296000003814697, "rewards/reward_func/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8462485000491142, "epoch": 1.80078125, "frac_reward_zero_std": 0.0, "grad_norm": 0.24740563333034515, "learning_rate": 1.5263157894736844e-07, "loss": 0.0, "num_tokens": 4342608.0, "reward": 0.47290000319480896, "reward_std": 0.5461318492889404, "rewards/reward_func/mean": 0.47290000319480896, "rewards/reward_func/std": 0.5058882236480713, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4742927271872759, "epoch": 1.802734375, "frac_reward_zero_std": 0.0, "grad_norm": 1.2252196073532104, "learning_rate": 1.4736842105263158e-07, "loss": 0.0, "num_tokens": 4347248.0, "reward": 0.3668999969959259, "reward_std": 0.5270397663116455, "rewards/reward_func/mean": 0.3668999969959259, "rewards/reward_func/std": 0.5063701272010803, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6519879586994648, "epoch": 1.8046875, "frac_reward_zero_std": 0.0, "grad_norm": 0.25893595814704895, "learning_rate": 1.4210526315789474e-07, "loss": -0.0, "num_tokens": 4352264.0, "reward": 0.23256249725818634, "reward_std": 0.4609765410423279, "rewards/reward_func/mean": 0.23256249725818634, "rewards/reward_func/std": 0.4271462559700012, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8685908392071724, "epoch": 1.806640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.24174730479717255, "learning_rate": 1.368421052631579e-07, "loss": -0.0, "num_tokens": 4356792.0, "reward": 0.6343749761581421, "reward_std": 0.5239908695220947, "rewards/reward_func/mean": 0.6343749761581421, "rewards/reward_func/std": 0.5047431588172913, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.584613062441349, "epoch": 1.80859375, "frac_reward_zero_std": 0.5, "grad_norm": 0.15689092874526978, "learning_rate": 1.3157894736842107e-07, "loss": 0.0, "num_tokens": 4361448.0, "reward": 0.760937511920929, "reward_std": 0.27662280201911926, "rewards/reward_func/mean": 0.760937511920929, "rewards/reward_func/std": 0.4432750344276428, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6101187616586685, "epoch": 1.810546875, "frac_reward_zero_std": 0.5, "grad_norm": 0.17513932287693024, "learning_rate": 1.2631578947368423e-07, "loss": 0.0, "num_tokens": 4366136.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.4629100561141968, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6578428782522678, "epoch": 1.8125, "frac_reward_zero_std": 0.0, "grad_norm": 0.19013863801956177, "learning_rate": 1.2105263157894737e-07, "loss": 0.0, "num_tokens": 4371112.0, "reward": 0.21320000290870667, "reward_std": 0.42640000581741333, "rewards/reward_func/mean": 0.21320000290870667, "rewards/reward_func/std": 0.3952178359031677, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.505165308713913, "epoch": 1.814453125, "frac_reward_zero_std": 0.0, "grad_norm": 0.9573839902877808, "learning_rate": 1.1578947368421054e-07, "loss": -0.0, "num_tokens": 4375760.0, "reward": 0.6168624758720398, "reward_std": 0.5261497497558594, "rewards/reward_func/mean": 0.6168625354766846, "rewards/reward_func/std": 0.5074918866157532, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.874191090464592, "epoch": 1.81640625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2154332995414734, "learning_rate": 1.1052631578947368e-07, "loss": 0.0, "num_tokens": 4380264.0, "reward": 0.6285499930381775, "reward_std": 0.2513039708137512, "rewards/reward_func/mean": 0.6285499930381775, "rewards/reward_func/std": 0.5067988634109497, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.9102950617671013, "epoch": 1.818359375, "frac_reward_zero_std": 0.5, "grad_norm": 0.1660175621509552, "learning_rate": 1.0526315789473685e-07, "loss": 0.0, "num_tokens": 4384872.0, "reward": 0.8723000288009644, "reward_std": 0.24825221300125122, "rewards/reward_func/mean": 0.8723000288009644, "rewards/reward_func/std": 0.3525434732437134, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.564951941370964, "epoch": 1.8203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23926304280757904, "learning_rate": 1.0000000000000001e-07, "loss": -0.0, "num_tokens": 4389696.0, "reward": 0.1210624948143959, "reward_std": 0.2421249896287918, "rewards/reward_func/mean": 0.1210624948143959, "rewards/reward_func/std": 0.33739402890205383, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4994359500706196, "epoch": 1.822265625, "frac_reward_zero_std": 0.0, "grad_norm": 0.25481706857681274, "learning_rate": 9.473684210526317e-08, "loss": 0.0, "num_tokens": 4394352.0, "reward": 0.629212498664856, "reward_std": 0.49713975191116333, "rewards/reward_func/mean": 0.629212498664856, "rewards/reward_func/std": 0.47718602418899536, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4711499195545912, "epoch": 1.82421875, "frac_reward_zero_std": 0.0, "grad_norm": 0.19426177442073822, "learning_rate": 8.947368421052632e-08, "loss": -0.0, "num_tokens": 4399000.0, "reward": 0.629349946975708, "reward_std": 0.5124804973602295, "rewards/reward_func/mean": 0.6293500065803528, "rewards/reward_func/std": 0.4940544664859772, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8083347305655479, "epoch": 1.826171875, "frac_reward_zero_std": 0.5, "grad_norm": 0.17153169214725494, "learning_rate": 8.421052631578947e-08, "loss": 0.0, "num_tokens": 4403608.0, "reward": 0.7593749761581421, "reward_std": 0.27827125787734985, "rewards/reward_func/mean": 0.7593749761581421, "rewards/reward_func/std": 0.4460016191005707, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6882787756621838, "epoch": 1.828125, "frac_reward_zero_std": 0.0, "grad_norm": 0.26333948969841003, "learning_rate": 7.894736842105264e-08, "loss": 0.0, "num_tokens": 4408400.0, "reward": 0.5950000286102295, "reward_std": 0.5117709040641785, "rewards/reward_func/mean": 0.5950000286102295, "rewards/reward_func/std": 0.4931241571903229, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8129791170358658, "epoch": 1.830078125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.368421052631579e-08, "loss": 0.0, "num_tokens": 4412896.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.45138827711343765, "epoch": 1.83203125, "frac_reward_zero_std": 0.0, "grad_norm": 0.18060848116874695, "learning_rate": 6.842105263157895e-08, "loss": 0.0, "num_tokens": 4417680.0, "reward": 0.3540624976158142, "reward_std": 0.5053315162658691, "rewards/reward_func/mean": 0.3540624976158142, "rewards/reward_func/std": 0.4852207601070404, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6235154792666435, "epoch": 1.833984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.16618505120277405, "learning_rate": 6.315789473684211e-08, "loss": -0.0, "num_tokens": 4422320.0, "reward": 0.7527874708175659, "reward_std": 0.4729631543159485, "rewards/reward_func/mean": 0.7527874708175659, "rewards/reward_func/std": 0.4385221302509308, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.32566132955253124, "epoch": 1.8359375, "frac_reward_zero_std": 0.0, "grad_norm": 0.1724817156791687, "learning_rate": 5.789473684210527e-08, "loss": 0.0, "num_tokens": 4426960.0, "reward": 0.38749998807907104, "reward_std": 0.5304675102233887, "rewards/reward_func/mean": 0.38750001788139343, "rewards/reward_func/std": 0.5074885487556458, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.3569133039563894, "epoch": 1.837890625, "frac_reward_zero_std": 0.0, "grad_norm": 0.17535947263240814, "learning_rate": 5.263157894736842e-08, "loss": 0.0, "num_tokens": 4431448.0, "reward": 0.6987625360488892, "reward_std": 0.4616749882698059, "rewards/reward_func/mean": 0.6987625360488892, "rewards/reward_func/std": 0.4274410605430603, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5731943100690842, "epoch": 1.83984375, "frac_reward_zero_std": 0.0, "grad_norm": 0.19585168361663818, "learning_rate": 4.7368421052631586e-08, "loss": 0.0, "num_tokens": 4436104.0, "reward": 0.3721874952316284, "reward_std": 0.5249817371368408, "rewards/reward_func/mean": 0.3721874952316284, "rewards/reward_func/std": 0.5038166046142578, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.5804979503154755, "epoch": 1.841796875, "frac_reward_zero_std": 0.5, "grad_norm": 0.15701980888843536, "learning_rate": 4.2105263157894737e-08, "loss": 0.0, "num_tokens": 4441096.0, "reward": 0.22850000858306885, "reward_std": 0.26406246423721313, "rewards/reward_func/mean": 0.22850000858306885, "rewards/reward_func/std": 0.4233279824256897, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.8702232874929905, "epoch": 1.84375, "frac_reward_zero_std": 0.0, "grad_norm": 0.30385154485702515, "learning_rate": 3.6842105263157894e-08, "loss": 0.0, "num_tokens": 4445896.0, "reward": 0.5129749774932861, "reward_std": 0.47286224365234375, "rewards/reward_func/mean": 0.5129749774932861, "rewards/reward_func/std": 0.5019313097000122, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6899589747190475, "epoch": 1.845703125, "frac_reward_zero_std": 0.0, "grad_norm": 0.23425976932048798, "learning_rate": 3.157894736842106e-08, "loss": 0.0, "num_tokens": 4450384.0, "reward": 0.8695999979972839, "reward_std": 0.25623539090156555, "rewards/reward_func/mean": 0.8695999979972839, "rewards/reward_func/std": 0.35150691866874695, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.7242371030151844, "epoch": 1.84765625, "frac_reward_zero_std": 0.5, "grad_norm": 0.18696576356887817, "learning_rate": 2.631578947368421e-08, "loss": 0.0, "num_tokens": 4454728.0, "reward": 0.8765624761581421, "reward_std": 0.24687500298023224, "rewards/reward_func/mean": 0.8765624761581421, "rewards/reward_func/std": 0.34913399815559387, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6600115075707436, "epoch": 1.849609375, "frac_reward_zero_std": 0.0, "grad_norm": 0.26843783259391785, "learning_rate": 2.1052631578947368e-08, "loss": -0.0, "num_tokens": 4459744.0, "reward": 0.350600004196167, "reward_std": 0.4967568516731262, "rewards/reward_func/mean": 0.350600004196167, "rewards/reward_func/std": 0.4848525822162628, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.6009881878271699, "epoch": 1.8515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.2861350476741791, "learning_rate": 1.578947368421053e-08, "loss": -0.0, "num_tokens": 4464248.0, "reward": 0.5809999704360962, "reward_std": 0.5007524490356445, "rewards/reward_func/mean": 0.5809999704360962, "rewards/reward_func/std": 0.4811137020587921, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.4016810180619359, "epoch": 1.853515625, "frac_reward_zero_std": 0.0, "grad_norm": 0.19072452187538147, "learning_rate": 1.0526315789473684e-08, "loss": -0.0, "num_tokens": 4468744.0, "reward": 0.5809999704360962, "reward_std": 0.5007524490356445, "rewards/reward_func/mean": 0.5809999704360962, "rewards/reward_func/std": 0.4811137020587921, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 384.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 384.0, "completions/min_terminated_length": 0.0, "entropy": 0.92277492582798, "epoch": 1.85546875, "frac_reward_zero_std": 0.0, "grad_norm": 0.2709886431694031, "learning_rate": 5.263157894736842e-09, "loss": 0.0, "num_tokens": 4473248.0, "reward": 0.7473000288009644, "reward_std": 0.4982522130012512, "rewards/reward_func/mean": 0.7473000288009644, "rewards/reward_func/std": 0.4613037705421448, "step": 950 } ], "logging_steps": 1, "max_steps": 950, "num_input_tokens_seen": 4473248, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }