{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.2, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 68.5, "completions/mean_terminated_length": 68.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.008, "format_failures": 2.0, "grad_norm": 14.535454750061035, "learning_rate": 0.0, "loss": -0.1432, "num_tokens": 7296.0, "reward": 0.125, "reward_std": 0.8345229625701904, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 68.5, "completions/mean_terminated_length": 68.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.016, "format_failures": 2.0, "grad_norm": 14.53545093536377, "learning_rate": 1e-07, "loss": -0.1432, "num_tokens": 14592.0, "reward": 0.125, "reward_std": 0.8345229625701904, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 55.875, "completions/mean_terminated_length": 55.875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.024, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 2e-07, "loss": 0.0, "num_tokens": 19480.0, "reward": 0.0, "reward_std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 108.625, "completions/mean_terminated_length": 108.625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.032, "format_failures": 0.0, "grad_norm": 2.9238641262054443, "learning_rate": 3e-07, "loss": 0.0659, "num_tokens": 27576.0, "reward": 0.25, "reward_std": 0.4629100561141968, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 95.125, "completions/mean_terminated_length": 95.125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.04, "format_failures": 0.0, "grad_norm": 4.841046333312988, "learning_rate": 4e-07, "loss": -0.1454, "num_tokens": 36280.0, "reward": 0.375, "reward_std": 0.5175491571426392, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.048, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 47208.0, "reward": 0.0, "reward_std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.056, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 6e-07, "loss": 0.0, "num_tokens": 57272.0, "reward": 0.0, "reward_std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.064, "format_failures": 0.0, "grad_norm": 3.39980149269104, "learning_rate": 7e-07, "loss": -0.0223, "num_tokens": 62224.0, "reward": 0.25, "reward_std": 0.4629100561141968, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.072, "format_failures": 0.0, "grad_norm": 2.5628979206085205, "learning_rate": 8e-07, "loss": 0.0208, "num_tokens": 70240.0, "reward": 0.528124988079071, "reward_std": 0.46587809920310974, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 110.0, "completions/mean_terminated_length": 110.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.08, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 78024.0, "reward": 0.0, "reward_std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 87.125, "completions/mean_terminated_length": 87.125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.088, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 86080.0, "reward": 0.0, "reward_std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 87.125, "completions/mean_terminated_length": 87.125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.096, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 93792.0, "reward": 0.0, "reward_std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 77.375, "completions/mean_terminated_length": 77.375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.104, "format_failures": 0.0, "grad_norm": 6.721037864685059, "learning_rate": 1e-06, "loss": -0.1101, "num_tokens": 99216.0, "reward": 0.625, "reward_std": 0.5175491571426392, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.112, "format_failures": 0.0, "grad_norm": 0.8888375759124756, "learning_rate": 1e-06, "loss": -0.027, "num_tokens": 108576.0, "reward": 0.0625, "reward_std": 0.12400397658348083, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.12, "format_failures": 0.0, "grad_norm": 3.044679641723633, "learning_rate": 1e-06, "loss": -0.0078, "num_tokens": 119720.0, "reward": 0.09375, "reward_std": 0.2651650309562683, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 116.625, "completions/mean_terminated_length": 116.625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.128, "format_failures": 0.0, "grad_norm": 3.0491764545440674, "learning_rate": 1e-06, "loss": -0.0739, "num_tokens": 125832.0, "reward": 0.33392858505249023, "reward_std": 0.3862760365009308, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.136, "format_failures": 0.0, "grad_norm": 13.049592018127441, "learning_rate": 1e-06, "loss": -0.1828, "num_tokens": 131136.0, "reward": 0.375, "reward_std": 0.5175491571426392, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 41.375, "completions/mean_terminated_length": 41.375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.144, "format_failures": 0.0, "grad_norm": 15.581900596618652, "learning_rate": 1e-06, "loss": -0.0101, "num_tokens": 134976.0, "reward": 0.3333333432674408, "reward_std": 0.4714045226573944, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 203.0, "completions/mean_terminated_length": 203.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.152, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 146816.0, "reward": 0.0, "reward_std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 96.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.16, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 153784.0, "reward": 0.0, "reward_std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 79.25, "completions/mean_terminated_length": 79.25, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.168, "format_failures": 0.0, "grad_norm": 3.057511806488037, "learning_rate": 1e-06, "loss": 0.0983, "num_tokens": 158992.0, "reward": 0.8333333730697632, "reward_std": 0.25197628140449524, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 60.875, "completions/mean_terminated_length": 60.875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.176, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 163256.0, "reward": 0.0, "reward_std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 111.25, "completions/mean_terminated_length": 111.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.184, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 175192.0, "reward": 0.0, "reward_std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 86.75, "completions/mean_terminated_length": 86.75, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.192, "format_failures": 1.0, "grad_norm": 14.607234954833984, "learning_rate": 1e-06, "loss": -0.1881, "num_tokens": 182600.0, "reward": 0.16249999403953552, "reward_std": 0.6139742136001587, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2, "format_failures": 2.0, "grad_norm": 13.306657791137695, "learning_rate": 1e-06, "loss": -0.2265, "num_tokens": 191192.0, "reward": 0.11736111342906952, "reward_std": 0.7586389183998108, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 70.625, "completions/mean_terminated_length": 70.625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.208, "format_failures": 0.0, "grad_norm": 9.159988403320312, "learning_rate": 1e-06, "loss": -0.1783, "num_tokens": 196864.0, "reward": 0.25, "reward_std": 0.4629100561141968, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 113.5, "completions/mean_terminated_length": 113.5, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.216, "format_failures": 0.0, "grad_norm": 3.2562782764434814, "learning_rate": 1e-06, "loss": 0.1733, "num_tokens": 203632.0, "reward": 0.7690476179122925, "reward_std": 0.34529557824134827, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 94.375, "completions/mean_terminated_length": 94.375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.224, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 214720.0, "reward": 0.0, "reward_std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 76.375, "completions/mean_terminated_length": 76.375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.232, "format_failures": 0.0, "grad_norm": 8.231141090393066, "learning_rate": 1e-06, "loss": -0.0223, "num_tokens": 220224.0, "reward": 0.7250000238418579, "reward_std": 0.4527692496776581, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.24, "format_failures": 1.0, "grad_norm": 6.005310535430908, "learning_rate": 1e-06, "loss": -0.1002, "num_tokens": 227528.0, "reward": 0.1041666641831398, "reward_std": 0.6103439927101135, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 115.125, "completions/mean_terminated_length": 115.125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.248, "format_failures": 0.0, "grad_norm": 2.13883113861084, "learning_rate": 1e-06, "loss": -0.0759, "num_tokens": 233984.0, "reward": 0.07500000298023224, "reward_std": 0.14880476891994476, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 99.125, "completions/mean_terminated_length": 99.125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.256, "format_failures": 0.0, "grad_norm": 5.693840026855469, "learning_rate": 1e-06, "loss": 0.0045, "num_tokens": 240552.0, "reward": 0.15000000596046448, "reward_std": 0.3505098223686218, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 58.875, "completions/mean_terminated_length": 58.875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.264, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 246744.0, "reward": 0.0, "reward_std": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 99.75, "completions/mean_terminated_length": 99.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.272, "format_failures": 0.0, "grad_norm": 2.652855157852173, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 255552.0, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 114.375, "completions/mean_terminated_length": 114.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.28, "format_failures": 0.0, "grad_norm": 4.276206970214844, "learning_rate": 1e-06, "loss": -0.0344, "num_tokens": 263104.0, "reward": 0.1666666716337204, "reward_std": 0.30860671401023865, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 22.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.288, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 268296.0, "reward": 0.0, "reward_std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 70.0, "completions/mean_terminated_length": 70.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.296, "format_failures": 1.0, "grad_norm": 50.614295959472656, "learning_rate": 1e-06, "loss": -0.0643, "num_tokens": 274040.0, "reward": 0.5, "reward_std": 0.7559289336204529, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 132.625, "completions/mean_terminated_length": 132.625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.304, "format_failures": 0.0, "grad_norm": 3.728706121444702, "learning_rate": 1e-06, "loss": 0.0654, "num_tokens": 282408.0, "reward": 0.4791666865348816, "reward_std": 0.46664538979530334, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 114.375, "completions/mean_terminated_length": 114.375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.312, "format_failures": 0.0, "grad_norm": 10.129213333129883, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 290960.0, "reward": 0.2750000059604645, "reward_std": 0.3761754035949707, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.32, "format_failures": 0.0, "grad_norm": 8.080669403076172, "learning_rate": 1e-06, "loss": 0.0246, "num_tokens": 299832.0, "reward": 0.32499998807907104, "reward_std": 0.46521884202957153, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 99.625, "completions/mean_terminated_length": 99.625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.328, "format_failures": 0.0, "grad_norm": 27.497039794921875, "learning_rate": 1e-06, "loss": 0.1274, "num_tokens": 307848.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 41.0, "completions/mean_terminated_length": 41.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.336, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 312376.0, "reward": 1.0, "reward_std": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 81.75, "completions/mean_terminated_length": 81.75, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.344, "format_failures": 0.0, "grad_norm": 20.424545288085938, "learning_rate": 1e-06, "loss": 0.1492, "num_tokens": 318888.0, "reward": 0.7395833730697632, "reward_std": 0.40197303891181946, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.352, "format_failures": 0.0, "grad_norm": 1.9911680221557617, "learning_rate": 1e-06, "loss": -0.0079, "num_tokens": 327048.0, "reward": 0.04782196879386902, "reward_std": 0.0690462738275528, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.36, "format_failures": 0.0, "grad_norm": 31.368621826171875, "learning_rate": 1e-06, "loss": 0.0417, "num_tokens": 332936.0, "reward": 0.71875, "reward_std": 0.45193037390708923, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 84.375, "completions/mean_terminated_length": 84.375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.368, "format_failures": 0.0, "grad_norm": 3.144503593444824, "learning_rate": 1e-06, "loss": 0.0339, "num_tokens": 338896.0, "reward": 0.9437500238418579, "reward_std": 0.10500850528478622, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.376, "format_failures": 0.0, "grad_norm": 7.377927303314209, "learning_rate": 1e-06, "loss": 0.1124, "num_tokens": 348024.0, "reward": 0.20000000298023224, "reward_std": 0.38544967770576477, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 100.25, "completions/mean_terminated_length": 100.25, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.384, "format_failures": 0.0, "grad_norm": 10.651869773864746, "learning_rate": 1e-06, "loss": 0.0577, "num_tokens": 357192.0, "reward": 0.125, "reward_std": 0.3535533845424652, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.392, "format_failures": 0.0, "grad_norm": 8.277122497558594, "learning_rate": 1e-06, "loss": 0.0829, "num_tokens": 365064.0, "reward": 0.7291666269302368, "reward_std": 0.27970364689826965, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.4, "format_failures": 0.0, "grad_norm": 21.047945022583008, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 369512.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.408, "format_failures": 0.0, "grad_norm": 10.351432800292969, "learning_rate": 1e-06, "loss": 0.1456, "num_tokens": 377488.0, "reward": 0.6888439655303955, "reward_std": 0.3107835352420807, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 133.125, "completions/mean_terminated_length": 133.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.416, "format_failures": 0.0, "grad_norm": 3.8217451572418213, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 384976.0, "reward": 0.9404761791229248, "reward_std": 0.121405228972435, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.424, "format_failures": 0.0, "grad_norm": 58.9612922668457, "learning_rate": 1e-06, "loss": 0.1708, "num_tokens": 394360.0, "reward": 0.625, "reward_std": 0.5175491571426392, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.432, "format_failures": 0.0, "grad_norm": 23.820772171020508, "learning_rate": 1e-06, "loss": -0.0159, "num_tokens": 402064.0, "reward": 0.625, "reward_std": 0.5175491571426392, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.44, "format_failures": 0.0, "grad_norm": 35.03224182128906, "learning_rate": 1e-06, "loss": 0.2828, "num_tokens": 409640.0, "reward": 0.71875, "reward_std": 0.3907092809677124, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 268.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.448, "format_failures": 0.0, "grad_norm": 13.870576858520508, "learning_rate": 1e-06, "loss": -0.0312, "num_tokens": 425384.0, "reward": 0.4104166626930237, "reward_std": 0.4187033772468567, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 75.125, "completions/mean_terminated_length": 75.125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.456, "format_failures": 0.0, "grad_norm": 53.12404251098633, "learning_rate": 1e-06, "loss": -0.1243, "num_tokens": 433136.0, "reward": 0.53125, "reward_std": 0.5077524185180664, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 73.125, "completions/mean_terminated_length": 73.125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.464, "format_failures": 0.0, "grad_norm": 39.553680419921875, "learning_rate": 1e-06, "loss": 0.0804, "num_tokens": 438800.0, "reward": 0.7589285373687744, "reward_std": 0.3720118999481201, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 249.625, "completions/mean_terminated_length": 249.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.472, "format_failures": 0.0, "grad_norm": 60.6335334777832, "learning_rate": 1e-06, "loss": 0.2012, "num_tokens": 448896.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 75.875, "completions/mean_terminated_length": 75.875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.48, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 454024.0, "reward": 0.0, "reward_std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 92.75, "completions/mean_terminated_length": 92.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.488, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 460824.0, "reward": 1.0, "reward_std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.496, "format_failures": 0.0, "grad_norm": 20.28946876525879, "learning_rate": 1e-06, "loss": 0.1128, "num_tokens": 469056.0, "reward": 0.382438600063324, "reward_std": 0.315915584564209, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 45.625, "completions/mean_terminated_length": 45.625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.504, "format_failures": 0.0, "grad_norm": 15.72630786895752, "learning_rate": 1e-06, "loss": 0.0539, "num_tokens": 473704.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 92.875, "completions/mean_terminated_length": 92.875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.512, "format_failures": 0.0, "grad_norm": 43.74641418457031, "learning_rate": 1e-06, "loss": 0.1736, "num_tokens": 480968.0, "reward": 0.7833333015441895, "reward_std": 0.4074699282646179, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 103.5, "completions/mean_terminated_length": 103.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.52, "format_failures": 0.0, "grad_norm": 15.517630577087402, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 487304.0, "reward": 0.1875, "reward_std": 0.2587745785713196, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.528, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 494024.0, "reward": 1.0, "reward_std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.536, "format_failures": 0.0, "grad_norm": 5.775947570800781, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 498864.0, "reward": 0.4791666865348816, "reward_std": 0.058925557881593704, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 107.125, "completions/mean_terminated_length": 107.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.544, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 503432.0, "reward": 1.0, "reward_std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 83.0, "completions/mean_terminated_length": 83.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.552, "format_failures": 0.0, "grad_norm": 60.845848083496094, "learning_rate": 1e-06, "loss": 0.0181, "num_tokens": 509880.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 63.0, "completions/mean_terminated_length": 63.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.56, "format_failures": 0.0, "grad_norm": 13.625865936279297, "learning_rate": 1e-06, "loss": 0.0298, "num_tokens": 516736.0, "reward": 0.875, "reward_std": 0.1725163757801056, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 450.75, "completions/mean_terminated_length": 450.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.568, "format_failures": 0.0, "grad_norm": 7.975990295410156, "learning_rate": 1e-06, "loss": -0.0885, "num_tokens": 530072.0, "reward": 0.625, "reward_std": 0.4154745042324066, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.576, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 534528.0, "reward": 0.0, "reward_std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 80.625, "completions/mean_terminated_length": 80.625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.584, "format_failures": 0.0, "grad_norm": 11.076742172241211, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 543136.0, "reward": 0.02083333395421505, "reward_std": 0.0589255690574646, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 63.5, "completions/mean_terminated_length": 63.5, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.592, "format_failures": 0.0, "grad_norm": 47.686092376708984, "learning_rate": 1e-06, "loss": 0.0207, "num_tokens": 547944.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 127.875, "completions/mean_terminated_length": 127.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.6, "format_failures": 0.0, "grad_norm": 71.78514099121094, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 554800.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 86.875, "completions/mean_terminated_length": 86.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.608, "format_failures": 0.0, "grad_norm": 33.52473831176758, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 562336.0, "reward": 0.484375, "reward_std": 0.46501487493515015, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 77.625, "completions/mean_terminated_length": 77.625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.616, "format_failures": 0.0, "grad_norm": 31.954763412475586, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 568208.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 89.875, "completions/mean_terminated_length": 89.875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.624, "format_failures": 0.0, "grad_norm": 71.36677551269531, "learning_rate": 1e-06, "loss": 0.2247, "num_tokens": 580376.0, "reward": 0.9090908765792847, "reward_std": 0.2571297585964203, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 83.5, "completions/mean_terminated_length": 83.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.632, "format_failures": 0.0, "grad_norm": 67.83323669433594, "learning_rate": 1e-06, "loss": 0.2192, "num_tokens": 586704.0, "reward": 0.5928571224212646, "reward_std": 0.42122966051101685, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 72.625, "completions/mean_terminated_length": 72.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.64, "format_failures": 0.0, "grad_norm": 32.55586242675781, "learning_rate": 1e-06, "loss": 0.091, "num_tokens": 591600.0, "reward": 0.7083333730697632, "reward_std": 0.4520675837993622, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 92.875, "completions/mean_terminated_length": 92.875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.648, "format_failures": 0.0, "grad_norm": 40.66396713256836, "learning_rate": 1e-06, "loss": -0.0216, "num_tokens": 597704.0, "reward": 0.675000011920929, "reward_std": 0.46521881222724915, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 68.0, "completions/mean_terminated_length": 68.0, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.656, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 602704.0, "reward": 1.0, "reward_std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 76.25, "completions/mean_terminated_length": 76.25, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.664, "format_failures": 0.0, "grad_norm": 36.38127517700195, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 608840.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 86.0, "completions/mean_terminated_length": 86.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.672, "format_failures": 0.0, "grad_norm": 16.564855575561523, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 614896.0, "reward": 0.8458333015441895, "reward_std": 0.18934932351112366, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.68, "format_failures": 0.0, "grad_norm": 47.10960388183594, "learning_rate": 1e-06, "loss": 0.258, "num_tokens": 623912.0, "reward": 0.6875, "reward_std": 0.45806270837783813, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 111.25, "completions/mean_terminated_length": 111.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.688, "format_failures": 0.0, "grad_norm": 49.59060287475586, "learning_rate": 1e-06, "loss": 0.1332, "num_tokens": 629824.0, "reward": 0.39895835518836975, "reward_std": 0.33095693588256836, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 68.125, "completions/mean_terminated_length": 68.125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.696, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 634944.0, "reward": 1.0, "reward_std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 101.75, "completions/mean_terminated_length": 101.75, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.704, "format_failures": 0.0, "grad_norm": 57.84336471557617, "learning_rate": 1e-06, "loss": -0.0312, "num_tokens": 642088.0, "reward": 0.46875, "reward_std": 0.4712729752063751, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 50.625, "completions/mean_terminated_length": 50.625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.712, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 647456.0, "reward": 1.0, "reward_std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 50.375, "completions/mean_terminated_length": 50.375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.72, "format_failures": 0.0, "grad_norm": 35.49184036254883, "learning_rate": 1e-06, "loss": 0.009, "num_tokens": 652584.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 202.5, "completions/mean_terminated_length": 202.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.728, "format_failures": 0.0, "grad_norm": 11.807365417480469, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 663208.0, "reward": 0.1979166716337204, "reward_std": 0.16629423201084137, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.736, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 667040.0, "reward": 1.0, "reward_std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 116.5, "completions/mean_terminated_length": 116.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.744, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 676448.0, "reward": 1.0, "reward_std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 86.75, "completions/mean_terminated_length": 86.75, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.752, "format_failures": 0.0, "grad_norm": 25.079971313476562, "learning_rate": 1e-06, "loss": 0.0304, "num_tokens": 682688.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 62.0, "completions/mean_terminated_length": 62.0, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.76, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 687672.0, "reward": 1.0, "reward_std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 82.875, "completions/mean_terminated_length": 82.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.768, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 693840.0, "reward": 1.0, "reward_std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 97.5, "completions/mean_terminated_length": 97.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.776, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 703104.0, "reward": 0.0, "reward_std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 66.75, "completions/mean_terminated_length": 66.75, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.784, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 708256.0, "reward": 1.0, "reward_std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 80.625, "completions/mean_terminated_length": 80.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.792, "format_failures": 0.0, "grad_norm": 39.104312896728516, "learning_rate": 1e-06, "loss": 0.0098, "num_tokens": 713928.0, "reward": 0.5625, "reward_std": 0.4955156147480011, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 69.0, "completions/mean_terminated_length": 69.0, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.8, "format_failures": 0.0, "grad_norm": 45.443206787109375, "learning_rate": 1e-06, "loss": -0.0054, "num_tokens": 719272.0, "reward": 0.375, "reward_std": 0.5175491571426392, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 384.25, "completions/mean_terminated_length": 384.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.808, "format_failures": 0.0, "grad_norm": 10.565791130065918, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 737624.0, "reward": 0.45625001192092896, "reward_std": 0.27587342262268066, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 54.875, "completions/mean_terminated_length": 54.875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.816, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 743784.0, "reward": 1.0, "reward_std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 75.75, "completions/mean_terminated_length": 75.75, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.824, "format_failures": 0.0, "grad_norm": 39.79313659667969, "learning_rate": 1e-06, "loss": -0.0069, "num_tokens": 752688.0, "reward": 0.4166666865348816, "reward_std": 0.34503278136253357, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 144.25, "completions/mean_terminated_length": 144.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.832, "format_failures": 0.0, "grad_norm": 16.38981819152832, "learning_rate": 1e-06, "loss": -0.0178, "num_tokens": 761464.0, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 120.0, "completions/mean_terminated_length": 120.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.84, "format_failures": 0.0, "grad_norm": 27.306835174560547, "learning_rate": 1e-06, "loss": 0.138, "num_tokens": 768784.0, "reward": 0.8958333730697632, "reward_std": 0.19795581698417664, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 63.375, "completions/mean_terminated_length": 63.375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.848, "format_failures": 0.0, "grad_norm": 4.108266353607178, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 774600.0, "reward": 0.9791666269302368, "reward_std": 0.058925580233335495, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 80.875, "completions/mean_terminated_length": 80.875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.856, "format_failures": 0.0, "grad_norm": 41.572898864746094, "learning_rate": 1e-06, "loss": -0.0168, "num_tokens": 781640.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.864, "format_failures": 0.0, "grad_norm": 23.143457412719727, "learning_rate": 1e-06, "loss": 0.0474, "num_tokens": 793912.0, "reward": 0.25, "reward_std": 0.26726123690605164, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.872, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 800480.0, "reward": 1.0, "reward_std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 66.875, "completions/mean_terminated_length": 66.875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.88, "format_failures": 0.0, "grad_norm": 70.82369232177734, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 806064.0, "reward": 0.4583333730697632, "reward_std": 0.43415671586990356, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 118.75, "completions/mean_terminated_length": 118.75, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.888, "format_failures": 0.0, "grad_norm": 62.233604431152344, "learning_rate": 1e-06, "loss": 0.1027, "num_tokens": 812768.0, "reward": 0.9270833134651184, "reward_std": 0.2062394618988037, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 89.0, "completions/mean_terminated_length": 89.0, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.896, "format_failures": 0.0, "grad_norm": 312.40289306640625, "learning_rate": 1e-06, "loss": 0.3998, "num_tokens": 820040.0, "reward": 0.9166666865348816, "reward_std": 0.2357022613286972, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 276.0, "completions/mean_terminated_length": 276.0, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.904, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 832072.0, "reward": 1.0, "reward_std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 67.25, "completions/mean_terminated_length": 67.25, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.912, "format_failures": 0.0, "grad_norm": 86.25886535644531, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 838152.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 59.75, "completions/mean_terminated_length": 59.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.92, "format_failures": 0.0, "grad_norm": 186.99095153808594, "learning_rate": 1e-06, "loss": 0.1246, "num_tokens": 842544.0, "reward": 0.606249988079071, "reward_std": 0.48287642002105713, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.928, "format_failures": 0.0, "grad_norm": 69.57901763916016, "learning_rate": 1e-06, "loss": -0.0571, "num_tokens": 852376.0, "reward": 0.1666666716337204, "reward_std": 0.35634833574295044, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 75.0, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.936, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 857592.0, "reward": 1.0, "reward_std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 143.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.944, "format_failures": 0.0, "grad_norm": 7.837944030761719, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 865584.0, "reward": 0.9750000238418579, "reward_std": 0.0707106813788414, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 98.0, "completions/mean_terminated_length": 98.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.952, "format_failures": 0.0, "grad_norm": 71.9573745727539, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 871640.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.96, "format_failures": 0.0, "grad_norm": 92.61614227294922, "learning_rate": 1e-06, "loss": -0.0117, "num_tokens": 876496.0, "reward": 0.375, "reward_std": 0.5175491571426392, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.968, "format_failures": 0.0, "grad_norm": 9.656927108764648, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 884040.0, "reward": 0.375, "reward_std": 0.13363061845302582, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.976, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 887704.0, "reward": 1.0, "reward_std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 85.5, "completions/mean_terminated_length": 85.5, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.984, "format_failures": 0.0, "grad_norm": 71.37324523925781, "learning_rate": 1e-06, "loss": -0.0132, "num_tokens": 892984.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 82.875, "completions/mean_terminated_length": 82.875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.992, "format_failures": 0.0, "grad_norm": 35.24643325805664, "learning_rate": 1e-06, "loss": -0.0222, "num_tokens": 899592.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 66.625, "completions/mean_terminated_length": 66.625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 1.0, "format_failures": 0.0, "grad_norm": 34.353172302246094, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 904896.0, "reward": 0.8125, "reward_std": 0.3720118999481201, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 77.25, "completions/mean_terminated_length": 77.25, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.008, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 910368.0, "reward": 1.0, "reward_std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.016, "format_failures": 0.0, "grad_norm": 24.645448684692383, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 913976.0, "reward": 0.4583333432674408, "reward_std": 0.17251639068126678, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 96.0, "completions/mean_terminated_length": 96.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.024, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 922576.0, "reward": 1.0, "reward_std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 80.0, "completions/mean_terminated_length": 80.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 1.032, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 930840.0, "reward": 0.3333333432674408, "reward_std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 74.25, "completions/mean_terminated_length": 74.25, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.04, "format_failures": 0.0, "grad_norm": 4.1824846267700195, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 935936.0, "reward": 0.9583333730697632, "reward_std": 0.11785111576318741, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 65.25, "completions/mean_terminated_length": 65.25, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.048, "format_failures": 0.0, "grad_norm": 118.19164276123047, "learning_rate": 1e-06, "loss": 0.0455, "num_tokens": 941672.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 151.625, "completions/mean_terminated_length": 151.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 1.056, "format_failures": 0.0, "grad_norm": 15.322415351867676, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 951880.0, "reward": 0.75, "reward_std": 0.37796446681022644, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 298.0, "completions/mean_terminated_length": 298.0, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 1.064, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 959736.0, "reward": 1.0, "reward_std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 64.25, "completions/mean_terminated_length": 64.25, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.072, "format_failures": 0.0, "grad_norm": 78.67206573486328, "learning_rate": 1e-06, "loss": -0.018, "num_tokens": 965080.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 154.625, "completions/mean_terminated_length": 154.625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 1.08, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 975344.0, "reward": 0.0, "reward_std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 109.75, "completions/mean_terminated_length": 109.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 1.088, "format_failures": 0.0, "grad_norm": 143.0574951171875, "learning_rate": 1e-06, "loss": 0.0299, "num_tokens": 981592.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 112.0, "completions/mean_terminated_length": 112.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 1.096, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 987976.0, "reward": 1.0, "reward_std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.104, "format_failures": 0.0, "grad_norm": 87.06700134277344, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 992688.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 86.0, "completions/mean_terminated_length": 86.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 1.112, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 997080.0, "reward": 1.0, "reward_std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.12, "format_failures": 0.0, "grad_norm": 8.899555206298828, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 1001736.0, "reward": 0.9583333730697632, "reward_std": 0.11785111576318741, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 75.25, "completions/mean_terminated_length": 75.25, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.1280000000000001, "format_failures": 0.0, "grad_norm": 69.08828735351562, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 1007400.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 88.5, "completions/mean_terminated_length": 88.5, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.1360000000000001, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1014208.0, "reward": 1.0, "reward_std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 66.5, "completions/mean_terminated_length": 66.5, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.144, "format_failures": 0.0, "grad_norm": 54.22294616699219, "learning_rate": 1e-06, "loss": 0.0352, "num_tokens": 1019120.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 57.375, "completions/mean_terminated_length": 57.375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.152, "format_failures": 0.0, "grad_norm": 24.258403778076172, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 1024992.0, "reward": 0.8333333730697632, "reward_std": 0.17817413806915283, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 58.875, "completions/mean_terminated_length": 58.875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.16, "format_failures": 0.0, "grad_norm": 129.38082885742188, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 1030688.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 1.168, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1037992.0, "reward": 1.0, "reward_std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 275.875, "completions/mean_terminated_length": 275.875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 1.176, "format_failures": 0.0, "grad_norm": 6.270515441894531, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 1048520.0, "reward": 0.9166666865348816, "reward_std": 0.15430332720279694, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 56.875, "completions/mean_terminated_length": 56.875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.184, "format_failures": 0.0, "grad_norm": 91.06282043457031, "learning_rate": 1e-06, "loss": 0.0317, "num_tokens": 1052912.0, "reward": 0.84375, "reward_std": 0.29693374037742615, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 159.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 1.192, "format_failures": 0.0, "grad_norm": 264.9582214355469, "learning_rate": 1e-06, "loss": 0.0401, "num_tokens": 1062024.0, "reward": 0.8125, "reward_std": 0.3720118999481201, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 1.2, "format_failures": 0.0, "grad_norm": 10.361461639404297, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1066560.0, "reward": 0.9583333730697632, "reward_std": 0.11785111576318741, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.208, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1073392.0, "reward": 1.0, "reward_std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 155.875, "completions/mean_terminated_length": 155.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 1.216, "format_failures": 0.0, "grad_norm": 27.547338485717773, "learning_rate": 1e-06, "loss": -0.008, "num_tokens": 1080680.0, "reward": 0.9166666865348816, "reward_std": 0.23570223152637482, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.224, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1084952.0, "reward": 1.0, "reward_std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 64.625, "completions/mean_terminated_length": 64.625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.232, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1090736.0, "reward": 1.0, "reward_std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 515.0, "completions/mean_terminated_length": 515.0, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 1.24, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1108568.0, "reward": 0.0, "reward_std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 80.375, "completions/mean_terminated_length": 80.375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.248, "format_failures": 0.0, "grad_norm": 203.04104614257812, "learning_rate": 1e-06, "loss": -0.0084, "num_tokens": 1115256.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 68.125, "completions/mean_terminated_length": 68.125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 1.256, "format_failures": 0.0, "grad_norm": 101.93464660644531, "learning_rate": 1e-06, "loss": 0.0353, "num_tokens": 1121936.0, "reward": 0.5833333730697632, "reward_std": 0.2357022762298584, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 1.264, "format_failures": 0.0, "grad_norm": 10.695550918579102, "learning_rate": 1e-06, "loss": -0.0137, "num_tokens": 1134552.0, "reward": 0.28125, "reward_std": 0.11732383072376251, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 132.25, "completions/mean_terminated_length": 132.25, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.272, "format_failures": 0.0, "grad_norm": 2130.13232421875, "learning_rate": 1e-06, "loss": 0.2141, "num_tokens": 1142896.0, "reward": 0.5297619104385376, "reward_std": 0.27247974276542664, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 83.0, "completions/mean_terminated_length": 83.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.28, "format_failures": 0.0, "grad_norm": 198.72451782226562, "learning_rate": 1e-06, "loss": -0.0994, "num_tokens": 1149120.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 131.0, "completions/mean_terminated_length": 131.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 1.288, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1159104.0, "reward": 1.0, "reward_std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 71.125, "completions/mean_terminated_length": 71.125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 1.296, "format_failures": 0.0, "grad_norm": 202.88536071777344, "learning_rate": 1e-06, "loss": -0.0248, "num_tokens": 1166416.0, "reward": 0.2083333432674408, "reward_std": 0.39591163396835327, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 67.875, "completions/mean_terminated_length": 67.875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 1.304, "format_failures": 0.0, "grad_norm": 115.8452377319336, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 1172680.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 97.0, "completions/mean_terminated_length": 97.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.312, "format_failures": 0.0, "grad_norm": 52.33538818359375, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 1181464.0, "reward": 0.7916666865348816, "reward_std": 0.2920915186405182, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 247.0, "completions/mean_terminated_length": 247.0, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 1.32, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1190616.0, "reward": 0.6666666865348816, "reward_std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 91.25, "completions/mean_terminated_length": 91.25, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 1.328, "format_failures": 0.0, "grad_norm": 53.28190994262695, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 1196952.0, "reward": 0.875, "reward_std": 0.13363061845302582, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.336, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1202912.0, "reward": 1.0, "reward_std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.3439999999999999, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1207096.0, "reward": 1.0, "reward_std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 71.25, "completions/mean_terminated_length": 71.25, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.3519999999999999, "format_failures": 0.0, "grad_norm": 141.00119018554688, "learning_rate": 1e-06, "loss": -0.0039, "num_tokens": 1214136.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 260.375, "completions/mean_terminated_length": 260.375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 1.3599999999999999, "format_failures": 0.0, "grad_norm": 11.8909912109375, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 1223712.0, "reward": 0.75, "reward_std": 0.15430334210395813, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 1.3679999999999999, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1232272.0, "reward": 1.0, "reward_std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 61.375, "completions/mean_terminated_length": 61.375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.376, "format_failures": 0.0, "grad_norm": 131.6269989013672, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 1236760.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 72.5, "completions/mean_terminated_length": 72.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 1.384, "format_failures": 0.0, "grad_norm": 482.1353454589844, "learning_rate": 1e-06, "loss": -0.1397, "num_tokens": 1244576.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.875, "completions/mean_terminated_length": 52.875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 1.392, "format_failures": 0.0, "grad_norm": 186.35694885253906, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 1249680.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 1.4, "format_failures": 0.0, "grad_norm": 23.51119041442871, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 1258968.0, "reward": 0.875, "reward_std": 0.2314550280570984, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 55.625, "completions/mean_terminated_length": 55.625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.408, "format_failures": 0.0, "grad_norm": 188.21485900878906, "learning_rate": 1e-06, "loss": -0.0275, "num_tokens": 1263544.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 39.375, "completions/mean_terminated_length": 39.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.416, "format_failures": 0.0, "grad_norm": 119.91997528076172, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 1267056.0, "reward": 0.7916666865348816, "reward_std": 0.3535533845424652, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 122.25, "completions/mean_terminated_length": 122.25, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 1.424, "format_failures": 0.0, "grad_norm": 50.41508102416992, "learning_rate": 1e-06, "loss": -0.0079, "num_tokens": 1274304.0, "reward": 0.125, "reward_std": 0.3535533845424652, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 57.75, "completions/mean_terminated_length": 57.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.432, "format_failures": 0.0, "grad_norm": 206.82757568359375, "learning_rate": 1e-06, "loss": -0.0398, "num_tokens": 1281808.0, "reward": 0.4375, "reward_std": 0.4955156147480011, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 169.0, "completions/mean_terminated_length": 169.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 1.44, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1288368.0, "reward": 1.0, "reward_std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 110.75, "completions/mean_terminated_length": 110.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 1.448, "format_failures": 0.0, "grad_norm": 235.8719482421875, "learning_rate": 1e-06, "loss": 0.044, "num_tokens": 1293984.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 70.0, "completions/mean_terminated_length": 70.0, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.456, "format_failures": 0.0, "grad_norm": 84.64993286132812, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 1298592.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 92.625, "completions/mean_terminated_length": 92.625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 1.464, "format_failures": 0.0, "grad_norm": 148.4041748046875, "learning_rate": 1e-06, "loss": -0.013, "num_tokens": 1306392.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 282.0, "completions/mean_terminated_length": 282.0, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 1.472, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1316248.0, "reward": 1.0, "reward_std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 228.5, "completions/mean_terminated_length": 228.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 1.48, "format_failures": 0.0, "grad_norm": 16.818960189819336, "learning_rate": 1e-06, "loss": -0.0064, "num_tokens": 1327512.0, "reward": 0.75, "reward_std": 0.15430334210395813, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.488, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1331992.0, "reward": 1.0, "reward_std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 61.25, "completions/mean_terminated_length": 61.25, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.496, "format_failures": 0.0, "grad_norm": 396.1182556152344, "learning_rate": 1e-06, "loss": -0.051, "num_tokens": 1337024.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 58.875, "completions/mean_terminated_length": 58.875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.504, "format_failures": 0.0, "grad_norm": 224.00503540039062, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 1342288.0, "reward": 0.6270833611488342, "reward_std": 0.3330877125263214, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 76.375, "completions/mean_terminated_length": 76.375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.512, "format_failures": 0.0, "grad_norm": 141.5522003173828, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 1348376.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 70.0, "completions/mean_terminated_length": 70.0, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.52, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1355000.0, "reward": 1.0, "reward_std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.528, "format_failures": 0.0, "grad_norm": 82.11388397216797, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 1359216.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 73.125, "completions/mean_terminated_length": 73.125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.536, "format_failures": 0.0, "grad_norm": 136.23193359375, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 1365768.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 81.0, "completions/mean_terminated_length": 81.0, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 1.544, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1370432.0, "reward": 1.0, "reward_std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 70.125, "completions/mean_terminated_length": 70.125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 1.552, "format_failures": 0.0, "grad_norm": 347.42242431640625, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 1375928.0, "reward": 0.4583333432674408, "reward_std": 0.501980185508728, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.56, "format_failures": 0.0, "grad_norm": 173.91111755371094, "learning_rate": 1e-06, "loss": -0.0551, "num_tokens": 1380904.0, "reward": 0.625, "reward_std": 0.5175491571426392, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.375, "completions/mean_terminated_length": 51.375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.568, "format_failures": 0.0, "grad_norm": 107.2835922241211, "learning_rate": 1e-06, "loss": -0.0125, "num_tokens": 1385592.0, "reward": 0.625, "reward_std": 0.5175491571426392, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 139.875, "completions/mean_terminated_length": 139.875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 1.576, "format_failures": 0.0, "grad_norm": 269.141845703125, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 1393424.0, "reward": 0.3333333432674408, "reward_std": 0.35634833574295044, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 72.0, "completions/mean_terminated_length": 72.0, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.584, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1400616.0, "reward": 1.0, "reward_std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 1.592, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1409152.0, "reward": 1.0, "reward_std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 78.0, "completions/mean_terminated_length": 78.0, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 1.6, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1416256.0, "reward": 1.0, "reward_std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 1.608, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1424312.0, "reward": 1.0, "reward_std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.616, "format_failures": 0.0, "grad_norm": 43.83434295654297, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 1427952.0, "reward": 0.75, "reward_std": 0.15430334210395813, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 1.624, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1436592.0, "reward": 1.0, "reward_std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 147.0, "completions/mean_terminated_length": 147.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 1.6320000000000001, "format_failures": 0.0, "grad_norm": 26.562877655029297, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 1443280.0, "reward": 0.02083333395421505, "reward_std": 0.0589255690574646, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 97.875, "completions/mean_terminated_length": 97.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 1.6400000000000001, "format_failures": 0.0, "grad_norm": 677.6515502929688, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 1448976.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 1.6480000000000001, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1453976.0, "reward": 1.0, "reward_std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 113.5, "completions/mean_terminated_length": 113.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 1.6560000000000001, "format_failures": 0.0, "grad_norm": 152.89486694335938, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 1460648.0, "reward": 0.59375, "reward_std": 0.4212544858455658, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.125, "completions/mean_terminated_length": 53.125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.6640000000000001, "format_failures": 0.0, "grad_norm": 295.5757141113281, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 1466768.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 1.6720000000000002, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1476904.0, "reward": 1.0, "reward_std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 1.6800000000000002, "format_failures": 0.0, "grad_norm": 32.80705261230469, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 1487088.0, "reward": 0.90625, "reward_std": 0.1293872892856598, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 376.0, "completions/mean_terminated_length": 376.0, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 1.688, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1498280.0, "reward": 1.0, "reward_std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 97.0, "completions/mean_terminated_length": 97.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 1.696, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1504248.0, "reward": 1.0, "reward_std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 63.0, "completions/mean_terminated_length": 63.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.704, "format_failures": 0.0, "grad_norm": 270.9079284667969, "learning_rate": 1e-06, "loss": -0.0503, "num_tokens": 1510608.0, "reward": 0.6666666269302368, "reward_std": 0.4714045226573944, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 186.0, "completions/mean_terminated_length": 186.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 1.712, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1518456.0, "reward": 1.0, "reward_std": 0.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.72, "format_failures": 0.0, "grad_norm": 197.4288787841797, "learning_rate": 1e-06, "loss": -0.0422, "num_tokens": 1522640.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 76.25, "completions/mean_terminated_length": 76.25, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 1.728, "format_failures": 0.0, "grad_norm": 129.10081481933594, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 1528992.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 207.75, "completions/mean_terminated_length": 207.75, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 1.736, "format_failures": 0.0, "grad_norm": 248.12551879882812, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 1536968.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 1.744, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1542264.0, "reward": 1.0, "reward_std": 0.0, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 116.625, "completions/mean_terminated_length": 116.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 1.752, "format_failures": 0.0, "grad_norm": 114.07322692871094, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 1549040.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 75.0, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.76, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1556216.0, "reward": 1.0, "reward_std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 66.0, "completions/mean_terminated_length": 66.0, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.768, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1561496.0, "reward": 0.0, "reward_std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.776, "format_failures": 0.0, "grad_norm": 491.00555419921875, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 1566856.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 64.75, "completions/mean_terminated_length": 64.75, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.784, "format_failures": 0.0, "grad_norm": 144.65443420410156, "learning_rate": 1e-06, "loss": -0.0261, "num_tokens": 1571752.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 405.875, "completions/mean_terminated_length": 405.875, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 1.792, "format_failures": 0.0, "grad_norm": 23.559663772583008, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 1585672.0, "reward": 0.6145833730697632, "reward_std": 0.1473139226436615, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 109.0, "completions/mean_terminated_length": 109.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 1.8, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1592656.0, "reward": 1.0, "reward_std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 51.375, "completions/mean_terminated_length": 51.375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.808, "format_failures": 0.0, "grad_norm": 308.489501953125, "learning_rate": 1e-06, "loss": -0.0222, "num_tokens": 1597336.0, "reward": 0.625, "reward_std": 0.5175491571426392, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 45.0, "completions/mean_terminated_length": 45.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.8159999999999998, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1601312.0, "reward": 1.0, "reward_std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 71.0, "completions/mean_terminated_length": 71.0, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.8239999999999998, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1605888.0, "reward": 1.0, "reward_std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 39.0, "completions/mean_terminated_length": 39.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 1.8319999999999999, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1609640.0, "reward": 1.0, "reward_std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 75.625, "completions/mean_terminated_length": 75.625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.8399999999999999, "format_failures": 0.0, "grad_norm": 718.7521362304688, "learning_rate": 1e-06, "loss": 0.0419, "num_tokens": 1615608.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 92.0, "completions/mean_terminated_length": 92.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 1.8479999999999999, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1623928.0, "reward": 1.0, "reward_std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.8559999999999999, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1627912.0, "reward": 1.0, "reward_std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 119.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 1.8639999999999999, "format_failures": 0.0, "grad_norm": 227.06643676757812, "learning_rate": 1e-06, "loss": 0.0252, "num_tokens": 1638336.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.8719999999999999, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1642488.0, "reward": 1.0, "reward_std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 1.88, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1647224.0, "reward": 1.0, "reward_std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 64.875, "completions/mean_terminated_length": 64.875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.888, "format_failures": 0.0, "grad_norm": 951.9050903320312, "learning_rate": 1e-06, "loss": -0.0092, "num_tokens": 1652352.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 79.125, "completions/mean_terminated_length": 79.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 1.896, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1658696.0, "reward": 1.0, "reward_std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 61.0, "completions/mean_terminated_length": 61.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 1.904, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1663816.0, "reward": 1.0, "reward_std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 46.625, "completions/mean_terminated_length": 46.625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.912, "format_failures": 0.0, "grad_norm": 115.91224670410156, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 1668584.0, "reward": 0.9375, "reward_std": 0.1157275140285492, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 1.92, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1672584.0, "reward": 1.0, "reward_std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 1.928, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1679888.0, "reward": 1.0, "reward_std": 0.0, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 89.25, "completions/mean_terminated_length": 89.25, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.936, "format_failures": 0.0, "grad_norm": 339.7657165527344, "learning_rate": 1e-06, "loss": 0.0711, "num_tokens": 1686832.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 40.5, "completions/mean_terminated_length": 40.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 1.944, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1691976.0, "reward": 1.0, "reward_std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 80.0, "completions/mean_terminated_length": 80.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 1.952, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1697720.0, "reward": 1.0, "reward_std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 94.0, "completions/mean_terminated_length": 94.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 1.96, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1705904.0, "reward": 1.0, "reward_std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.968, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1711016.0, "reward": 1.0, "reward_std": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 63.625, "completions/mean_terminated_length": 63.625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 1.976, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1715416.0, "reward": 0.0, "reward_std": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 98.0, "completions/mean_terminated_length": 98.0, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.984, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1722528.0, "reward": 1.0, "reward_std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 302.0, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 1.992, "format_failures": 0.0, "grad_norm": 21.67804527282715, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 1731280.0, "reward": 0.5416666865348816, "reward_std": 0.17251640558242798, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 88.25, "completions/mean_terminated_length": 88.25, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 2.0, "format_failures": 0.0, "grad_norm": 100.28002166748047, "learning_rate": 1e-06, "loss": -0.0133, "num_tokens": 1737216.0, "reward": 0.8125, "reward_std": 0.2587745785713196, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 84.25, "completions/mean_terminated_length": 84.25, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 2.008, "format_failures": 0.0, "grad_norm": 186.31800842285156, "learning_rate": 1e-06, "loss": -0.0159, "num_tokens": 1742744.0, "reward": 0.625, "reward_std": 0.5175491571426392, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 62.125, "completions/mean_terminated_length": 62.125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.016, "format_failures": 0.0, "grad_norm": 132.25576782226562, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 1749288.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 2.024, "format_failures": 0.0, "grad_norm": 70.13836669921875, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 1756632.0, "reward": 0.9583333730697632, "reward_std": 0.11785111576318741, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 107.0, "completions/mean_terminated_length": 107.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 2.032, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1764552.0, "reward": 1.0, "reward_std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 83.0, "completions/mean_terminated_length": 83.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 2.04, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1769088.0, "reward": 1.0, "reward_std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 345.0, "completions/mean_terminated_length": 345.0, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 2.048, "format_failures": 0.0, "grad_norm": 118.46062469482422, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 1778048.0, "reward": 0.1875, "reward_std": 0.2587745785713196, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 2.056, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1784512.0, "reward": 1.0, "reward_std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 290.0, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 2.064, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1793200.0, "reward": 1.0, "reward_std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 93.625, "completions/mean_terminated_length": 93.625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 2.072, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1799856.0, "reward": 0.0, "reward_std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 68.0, "completions/mean_terminated_length": 68.0, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.08, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1808984.0, "reward": 1.0, "reward_std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 158.875, "completions/mean_terminated_length": 158.875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 2.088, "format_failures": 0.0, "grad_norm": 11.88393497467041, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 1816016.0, "reward": 0.9166666865348816, "reward_std": 0.15430332720279694, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 65.625, "completions/mean_terminated_length": 65.625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.096, "format_failures": 0.0, "grad_norm": 352.6447448730469, "learning_rate": 1e-06, "loss": -0.0157, "num_tokens": 1821560.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 131.0, "completions/mean_terminated_length": 131.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 2.104, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1829048.0, "reward": 1.0, "reward_std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 2.112, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1835760.0, "reward": 1.0, "reward_std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 80.0, "completions/mean_terminated_length": 80.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 2.12, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1842688.0, "reward": 1.0, "reward_std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 66.0, "completions/mean_terminated_length": 66.0, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.128, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1847904.0, "reward": 1.0, "reward_std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 2.136, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1857344.0, "reward": 1.0, "reward_std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 78.0, "completions/mean_terminated_length": 78.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.144, "format_failures": 0.0, "grad_norm": 201.63685607910156, "learning_rate": 1e-06, "loss": -0.0385, "num_tokens": 1863640.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 87.0, "completions/mean_terminated_length": 87.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.152, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1870840.0, "reward": 1.0, "reward_std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 252.0, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 2.16, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1880344.0, "reward": 0.6666666865348816, "reward_std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.125, "completions/mean_terminated_length": 51.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.168, "format_failures": 0.0, "grad_norm": 120.31925964355469, "learning_rate": 1e-06, "loss": -0.0422, "num_tokens": 1883960.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 181.375, "completions/mean_terminated_length": 181.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 2.176, "format_failures": 0.0, "grad_norm": 15.985126495361328, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 1894024.0, "reward": 0.7083333730697632, "reward_std": 0.117851123213768, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.184, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1901112.0, "reward": 1.0, "reward_std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 74.0, "completions/mean_terminated_length": 74.0, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.192, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1908024.0, "reward": 1.0, "reward_std": 0.0, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 55.375, "completions/mean_terminated_length": 55.375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.2, "format_failures": 0.0, "grad_norm": 113.7641372680664, "learning_rate": 1e-06, "loss": -0.0085, "num_tokens": 1913152.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 90.0, "completions/mean_terminated_length": 90.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 2.208, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1920520.0, "reward": 0.3333333432674408, "reward_std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 85.375, "completions/mean_terminated_length": 85.375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.216, "format_failures": 0.0, "grad_norm": 150.4706268310547, "learning_rate": 1e-06, "loss": 0.0777, "num_tokens": 1926800.0, "reward": 0.5833333730697632, "reward_std": 0.2357022762298584, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 106.0, "completions/mean_terminated_length": 106.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 2.224, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1932792.0, "reward": 0.6666666865348816, "reward_std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 2.232, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1943640.0, "reward": 0.0, "reward_std": 0.0, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 2.24, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1947504.0, "reward": 1.0, "reward_std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 69.875, "completions/mean_terminated_length": 69.875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.248, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1954240.0, "reward": 1.0, "reward_std": 0.0, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 101.625, "completions/mean_terminated_length": 101.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 2.2560000000000002, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1959840.0, "reward": 1.0, "reward_std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 2.2640000000000002, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1968984.0, "reward": 1.0, "reward_std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 81.5, "completions/mean_terminated_length": 81.5, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 2.2720000000000002, "format_failures": 0.0, "grad_norm": 167.05389404296875, "learning_rate": 1e-06, "loss": 0.0256, "num_tokens": 1972880.0, "reward": 0.8333333730697632, "reward_std": 0.17817415297031403, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 106.0, "completions/mean_terminated_length": 106.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 2.2800000000000002, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1978408.0, "reward": 1.0, "reward_std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 86.5, "completions/mean_terminated_length": 86.5, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 2.288, "format_failures": 0.0, "grad_norm": 16.239858627319336, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 1985768.0, "reward": 0.875, "reward_std": 0.2314550280570984, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 62.0, "completions/mean_terminated_length": 62.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.296, "format_failures": 0.0, "grad_norm": 157.4322052001953, "learning_rate": 1e-06, "loss": 0.0252, "num_tokens": 1990608.0, "reward": 0.8125, "reward_std": 0.3720118999481201, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 90.0, "completions/mean_terminated_length": 90.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.304, "format_failures": 0.0, "grad_norm": 266.88690185546875, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1996832.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.312, "format_failures": 0.0, "grad_norm": 103.26960754394531, "learning_rate": 1e-06, "loss": -0.0203, "num_tokens": 2000840.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 36.125, "completions/mean_terminated_length": 36.125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 2.32, "format_failures": 0.0, "grad_norm": 65.8975830078125, "learning_rate": 1e-06, "loss": 0.0252, "num_tokens": 2004584.0, "reward": 0.9583333730697632, "reward_std": 0.11785111576318741, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 86.0, "completions/mean_terminated_length": 86.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 2.328, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2010664.0, "reward": 0.5, "reward_std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 64.625, "completions/mean_terminated_length": 64.625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.336, "format_failures": 0.0, "grad_norm": 330.7345275878906, "learning_rate": 1e-06, "loss": 0.0537, "num_tokens": 2014920.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 74.0, "completions/mean_terminated_length": 74.0, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.344, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2019968.0, "reward": 1.0, "reward_std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 100.0, "completions/mean_terminated_length": 100.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 2.352, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2026400.0, "reward": 1.0, "reward_std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 64.875, "completions/mean_terminated_length": 64.875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.36, "format_failures": 0.0, "grad_norm": 133.32974243164062, "learning_rate": 1e-06, "loss": -0.0152, "num_tokens": 2031552.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 77.5, "completions/mean_terminated_length": 77.5, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.368, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2036368.0, "reward": 1.0, "reward_std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 86.375, "completions/mean_terminated_length": 86.375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.376, "format_failures": 0.0, "grad_norm": 106.36997985839844, "learning_rate": 1e-06, "loss": -0.0165, "num_tokens": 2042128.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 164.0, "completions/mean_terminated_length": 164.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 2.384, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2049360.0, "reward": 0.6666666865348816, "reward_std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 89.75, "completions/mean_terminated_length": 89.75, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 2.392, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2055264.0, "reward": 1.0, "reward_std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.4, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2058360.0, "reward": 1.0, "reward_std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 2.408, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2064072.0, "reward": 1.0, "reward_std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 70.875, "completions/mean_terminated_length": 70.875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.416, "format_failures": 0.0, "grad_norm": 114.08187866210938, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 2069520.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 188.0, "completions/mean_terminated_length": 188.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 2.424, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2076744.0, "reward": 1.0, "reward_std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 100.0, "completions/mean_terminated_length": 100.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 2.432, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2082088.0, "reward": 1.0, "reward_std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 90.375, "completions/mean_terminated_length": 90.375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.44, "format_failures": 0.0, "grad_norm": 22.607831954956055, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 2086664.0, "reward": 0.9583333730697632, "reward_std": 0.11785111576318741, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 192.0, "completions/mean_terminated_length": 192.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 2.448, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2096072.0, "reward": 1.0, "reward_std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.456, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2100160.0, "reward": 1.0, "reward_std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 70.0, "completions/mean_terminated_length": 70.0, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 2.464, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2106104.0, "reward": 1.0, "reward_std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 2.472, "format_failures": 0.0, "grad_norm": 243.66343688964844, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2111296.0, "reward": 0.625, "reward_std": 0.5175491571426392, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 65.75, "completions/mean_terminated_length": 65.75, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.48, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2116136.0, "reward": 1.0, "reward_std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 2.488, "format_failures": 0.0, "grad_norm": 10.465584754943848, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2124384.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 48.875, "completions/mean_terminated_length": 48.875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.496, "format_failures": 0.0, "grad_norm": 218.04502868652344, "learning_rate": 1e-06, "loss": -0.0173, "num_tokens": 2129952.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 120.5, "completions/mean_terminated_length": 120.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 2.504, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2138664.0, "reward": 0.0, "reward_std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 92.0, "completions/mean_terminated_length": 92.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.512, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2144832.0, "reward": 1.0, "reward_std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 88.0, "completions/mean_terminated_length": 88.0, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 2.52, "format_failures": 0.0, "grad_norm": 510.1783752441406, "learning_rate": 1e-06, "loss": 0.0994, "num_tokens": 2151384.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 106.25, "completions/mean_terminated_length": 106.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 2.528, "format_failures": 0.0, "grad_norm": 84.95390319824219, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 2157976.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 111.375, "completions/mean_terminated_length": 111.375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 2.536, "format_failures": 0.0, "grad_norm": 79.69486999511719, "learning_rate": 1e-06, "loss": -0.0186, "num_tokens": 2169472.0, "reward": 0.21875, "reward_std": 0.2086307406425476, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 120.0, "completions/mean_terminated_length": 120.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 2.544, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2175472.0, "reward": 1.0, "reward_std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 74.0, "completions/mean_terminated_length": 74.0, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.552, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2182976.0, "reward": 1.0, "reward_std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 83.0, "completions/mean_terminated_length": 83.0, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.56, "format_failures": 0.0, "grad_norm": 474.4319763183594, "learning_rate": 1e-06, "loss": 0.0879, "num_tokens": 2195608.0, "reward": 0.4583333432674408, "reward_std": 0.11785111576318741, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 287.0, "completions/mean_terminated_length": 287.0, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 2.568, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2205680.0, "reward": 0.6666666865348816, "reward_std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 129.75, "completions/mean_terminated_length": 129.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 2.576, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2214168.0, "reward": 0.0, "reward_std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 252.25, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 2.584, "format_failures": 0.0, "grad_norm": 97.7177963256836, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 2222368.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 77.625, "completions/mean_terminated_length": 77.625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 2.592, "format_failures": 0.0, "grad_norm": 422.36474609375, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 2228728.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 79.0, "completions/mean_terminated_length": 79.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 2.6, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2233408.0, "reward": 1.0, "reward_std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 67.0, "completions/mean_terminated_length": 67.0, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.608, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2240216.0, "reward": 1.0, "reward_std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 59.0, "completions/mean_terminated_length": 59.0, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.616, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2245712.0, "reward": 1.0, "reward_std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 66.875, "completions/mean_terminated_length": 66.875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 2.624, "format_failures": 0.0, "grad_norm": 130.1912841796875, "learning_rate": 1e-06, "loss": -0.0147, "num_tokens": 2250856.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 93.25, "completions/mean_terminated_length": 93.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.632, "format_failures": 0.0, "grad_norm": 258.1058654785156, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 2257512.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 82.125, "completions/mean_terminated_length": 82.125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.64, "format_failures": 0.0, "grad_norm": 161.73495483398438, "learning_rate": 1e-06, "loss": -0.0447, "num_tokens": 2262456.0, "reward": 0.625, "reward_std": 0.5175491571426392, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 2.648, "format_failures": 0.0, "grad_norm": 331.5845642089844, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 2269240.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 2.656, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2277912.0, "reward": 1.0, "reward_std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 84.0, "completions/mean_terminated_length": 84.0, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 2.664, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2283328.0, "reward": 1.0, "reward_std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 515.0, "completions/mean_terminated_length": 515.0, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 2.672, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2296568.0, "reward": 0.5, "reward_std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 114.0, "completions/mean_terminated_length": 114.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 2.68, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2304424.0, "reward": 1.0, "reward_std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 70.5, "completions/mean_terminated_length": 70.5, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.6879999999999997, "format_failures": 0.0, "grad_norm": 184.2093505859375, "learning_rate": 1e-06, "loss": -0.0062, "num_tokens": 2310648.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 73.375, "completions/mean_terminated_length": 73.375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 2.6959999999999997, "format_failures": 0.0, "grad_norm": 489.6385192871094, "learning_rate": 1e-06, "loss": 0.2072, "num_tokens": 2316936.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 46.375, "completions/mean_terminated_length": 46.375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 2.7039999999999997, "format_failures": 0.0, "grad_norm": 198.34613037109375, "learning_rate": 1e-06, "loss": -0.0249, "num_tokens": 2322008.0, "reward": 0.25, "reward_std": 0.4629100561141968, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 289.0, "completions/mean_terminated_length": 289.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 2.7119999999999997, "format_failures": 0.0, "grad_norm": 181.61871337890625, "learning_rate": 1e-06, "loss": -0.0182, "num_tokens": 2331600.0, "reward": 0.1875, "reward_std": 0.20773723721504211, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 58.625, "completions/mean_terminated_length": 58.625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.7199999999999998, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2339504.0, "reward": 1.0, "reward_std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 98.25, "completions/mean_terminated_length": 98.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.7279999999999998, "format_failures": 1.0, "grad_norm": 1064.173095703125, "learning_rate": 1e-06, "loss": -0.1256, "num_tokens": 2347176.0, "reward": 0.125, "reward_std": 0.46929532289505005, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 92.0, "completions/mean_terminated_length": 92.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.7359999999999998, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2352312.0, "reward": 1.0, "reward_std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 116.75, "completions/mean_terminated_length": 116.75, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 2.7439999999999998, "format_failures": 0.0, "grad_norm": 30.95206642150879, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 2360112.0, "reward": 0.4166666865348816, "reward_std": 0.15430335700511932, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 74.375, "completions/mean_terminated_length": 74.375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 2.752, "format_failures": 0.0, "grad_norm": 67.86247253417969, "learning_rate": 1e-06, "loss": -0.0096, "num_tokens": 2367848.0, "reward": 0.4375, "reward_std": 0.1767766922712326, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 63.0, "completions/mean_terminated_length": 63.0, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 2.76, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2374208.0, "reward": 1.0, "reward_std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 103.0, "completions/mean_terminated_length": 103.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 2.768, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2380112.0, "reward": 1.0, "reward_std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 93.25, "completions/mean_terminated_length": 93.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.776, "format_failures": 0.0, "grad_norm": 67.7156982421875, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 2385912.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 69.0, "completions/mean_terminated_length": 69.0, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 2.784, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2390344.0, "reward": 1.0, "reward_std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 2.792, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2397592.0, "reward": 1.0, "reward_std": 0.0, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 80.0, "completions/mean_terminated_length": 80.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 2.8, "format_failures": 1.0, "grad_norm": 1507.3477783203125, "learning_rate": 1e-06, "loss": -0.2609, "num_tokens": 2404136.0, "reward": 0.625, "reward_std": 0.7440237998962402, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 62.875, "completions/mean_terminated_length": 62.875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.808, "format_failures": 0.0, "grad_norm": 115.66720581054688, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 2408976.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 2.816, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2415184.0, "reward": 1.0, "reward_std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 208.625, "completions/mean_terminated_length": 208.625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.824, "format_failures": 0.0, "grad_norm": 313.3124694824219, "learning_rate": 1e-06, "loss": -0.036, "num_tokens": 2429912.0, "reward": 0.19583332538604736, "reward_std": 0.27970364689826965, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 71.0, "completions/mean_terminated_length": 71.0, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 2.832, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2434856.0, "reward": 1.0, "reward_std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 92.0, "completions/mean_terminated_length": 92.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.84, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2441408.0, "reward": 1.0, "reward_std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 74.0, "completions/mean_terminated_length": 74.0, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.848, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2445848.0, "reward": 0.0, "reward_std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 2.856, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2450056.0, "reward": 1.0, "reward_std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 358.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 2.864, "format_failures": 0.0, "grad_norm": 82.1136703491211, "learning_rate": 1e-06, "loss": -0.0297, "num_tokens": 2470488.0, "reward": 0.1875, "reward_std": 0.2587745785713196, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 88.75, "completions/mean_terminated_length": 88.75, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.872, "format_failures": 0.0, "grad_norm": 38.4820442199707, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 2478016.0, "reward": 0.9375, "reward_std": 0.1767766922712326, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.88, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2482624.0, "reward": 1.0, "reward_std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 107.375, "completions/mean_terminated_length": 107.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 2.888, "format_failures": 0.0, "grad_norm": 1100.262451171875, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 2489416.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 117.625, "completions/mean_terminated_length": 117.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 2.896, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2496128.0, "reward": 1.0, "reward_std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 278.0, "completions/mean_terminated_length": 278.0, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 2.904, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2505360.0, "reward": 0.6666666865348816, "reward_std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.912, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2511072.0, "reward": 0.0, "reward_std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 62.0, "completions/mean_terminated_length": 62.0, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.92, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2516600.0, "reward": 1.0, "reward_std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.928, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2520984.0, "reward": 1.0, "reward_std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.5, "completions/mean_terminated_length": 38.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 2.936, "format_failures": 0.0, "grad_norm": 18.036239624023438, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 2524616.0, "reward": 0.875, "reward_std": 0.17251639068126678, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 103.625, "completions/mean_terminated_length": 103.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 2.944, "format_failures": 0.0, "grad_norm": 722.7540893554688, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 2530808.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 42.0, "completions/mean_terminated_length": 42.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.952, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2534736.0, "reward": 1.0, "reward_std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 200.875, "completions/mean_terminated_length": 200.875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 2.96, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2545888.0, "reward": 1.0, "reward_std": 0.0, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 2.968, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2550816.0, "reward": 1.0, "reward_std": 0.0, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.976, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2555296.0, "reward": 1.0, "reward_std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 76.0, "completions/mean_terminated_length": 76.0, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.984, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2560304.0, "reward": 1.0, "reward_std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 92.25, "completions/mean_terminated_length": 92.25, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 2.992, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2566528.0, "reward": 1.0, "reward_std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 100.375, "completions/mean_terminated_length": 100.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 3.0, "format_failures": 0.0, "grad_norm": 60.566749572753906, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 2572368.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 3.008, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2577112.0, "reward": 1.0, "reward_std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 70.0, "completions/mean_terminated_length": 70.0, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 3.016, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2581984.0, "reward": 1.0, "reward_std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 93.5, "completions/mean_terminated_length": 93.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 3.024, "format_failures": 0.0, "grad_norm": 71.45764923095703, "learning_rate": 1e-06, "loss": -0.113, "num_tokens": 2589344.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 3.032, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2599320.0, "reward": 0.0, "reward_std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 80.0, "completions/mean_terminated_length": 80.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 3.04, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2604168.0, "reward": 1.0, "reward_std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 516.0, "completions/mean_terminated_length": 516.0, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 3.048, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2619552.0, "reward": 1.0, "reward_std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 134.0, "completions/mean_terminated_length": 134.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 3.056, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2626696.0, "reward": 1.0, "reward_std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.625, "completions/mean_terminated_length": 53.625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 3.064, "format_failures": 1.0, "grad_norm": 197.88946533203125, "learning_rate": 1e-06, "loss": -0.1955, "num_tokens": 2633296.0, "reward": 0.625, "reward_std": 0.7440237998962402, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 68.0, "completions/mean_terminated_length": 68.0, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 3.072, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2637568.0, "reward": 1.0, "reward_std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 84.5, "completions/mean_terminated_length": 84.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 3.08, "format_failures": 1.0, "grad_norm": 127.36408233642578, "learning_rate": 1e-06, "loss": -0.2981, "num_tokens": 2643168.0, "reward": 0.375, "reward_std": 0.7440237998962402, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 155.0, "completions/mean_terminated_length": 155.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 3.088, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2651856.0, "reward": 1.0, "reward_std": 0.0, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 97.0, "completions/mean_terminated_length": 97.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 3.096, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2658184.0, "reward": 1.0, "reward_std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 323.0, "completions/mean_terminated_length": 323.0, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 3.104, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2667504.0, "reward": 1.0, "reward_std": 0.0, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 117.375, "completions/mean_terminated_length": 117.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 3.112, "format_failures": 0.0, "grad_norm": 26.941102981567383, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 2676184.0, "reward": 0.875, "reward_std": 0.17251639068126678, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 85.0, "completions/mean_terminated_length": 85.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 3.12, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2681048.0, "reward": 0.0, "reward_std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.128, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2685000.0, "reward": 0.0, "reward_std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 140.0, "completions/mean_terminated_length": 140.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 3.136, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2691152.0, "reward": 1.0, "reward_std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 95.5, "completions/mean_terminated_length": 95.5, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 3.144, "format_failures": 0.0, "grad_norm": 910.4259033203125, "learning_rate": 1e-06, "loss": 0.1325, "num_tokens": 2699248.0, "reward": 0.8125, "reward_std": 0.3471825420856476, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 332.0, "completions/mean_terminated_length": 332.0, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 3.152, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2710168.0, "reward": 0.0, "reward_std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 3.16, "format_failures": 1.0, "grad_norm": 1046.15966796875, "learning_rate": 1e-06, "loss": -0.1875, "num_tokens": 2715080.0, "reward": 0.625, "reward_std": 0.7440237998962402, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 3.168, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2723056.0, "reward": 1.0, "reward_std": 0.0, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 46.375, "completions/mean_terminated_length": 46.375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 3.176, "format_failures": 0.0, "grad_norm": 159.84628295898438, "learning_rate": 1e-06, "loss": -0.0101, "num_tokens": 2727800.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 289.0, "completions/mean_terminated_length": 289.0, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 3.184, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2739888.0, "reward": 1.0, "reward_std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 3.192, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2746520.0, "reward": 1.0, "reward_std": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 104.0, "completions/mean_terminated_length": 104.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 3.2, "format_failures": 0.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2753088.0, "reward": 1.0, "reward_std": 0.0, "step": 400 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 2753088, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }