{"loss": 1e-08, "grad_norm": 2.0850979, "learning_rate": 1e-08, "reward": 1.29589844, "reward_std": 0.51460886, "frac_reward_zero_std": 0.25, "rewards/MazeReward/mean": 0.0171875, "rewards/MazeReward/std": 0.0562746, "rewards/MazeFormat/mean": 0.8828125, "rewards/MazeFormat/std": 0.32290742, "rewards/Format/mean": 0.24121094, "rewards/Format/std": 0.04487404, "completions/mean_length": 147.515625, "completions/min_length": 43.0, "completions/max_length": 711.0, "completions/clipped_ratio": 0.0, "kl": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00854701, "global_step/max_steps": "1/2000", "percentage": "0.05%", "elapsed_time": "51s", "remaining_time": "1d 4h 23m 18s", "memory(GiB)": 35.45, "train_speed(iter/s)": 0.01956} {"loss": 2.261e-05, "grad_norm": 1.60396738, "learning_rate": 5e-08, "reward": 1.21606445, "reward_std": 0.44087144, "frac_reward_zero_std": 0.296875, "rewards/MazeReward/mean": 0.00976563, "rewards/MazeReward/std": 0.04561943, "rewards/MazeFormat/mean": 0.88671875, "rewards/MazeFormat/std": 0.31641269, "rewards/Format/mean": 0.23168945, "rewards/Format/std": 0.0638391, "completions/mean_length": 140.15625, "completions/min_length": 45.75, "completions/max_length": 525.0, "completions/clipped_ratio": 0.0, "kl": 0.00056486, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.04273504, "global_step/max_steps": "5/2000", "percentage": "0.25%", "elapsed_time": "3m 22s", "remaining_time": "22h 24m 0s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.024739} {"loss": 3.144e-05, "grad_norm": 1.47383502, "learning_rate": 1e-07, "reward": 1.26035156, "reward_std": 0.44822132, "frac_reward_zero_std": 0.35, "rewards/MazeReward/mean": 0.011875, "rewards/MazeReward/std": 0.05252101, "rewards/MazeFormat/mean": 0.9046875, "rewards/MazeFormat/std": 0.2910091, "rewards/Format/mean": 0.23691406, "rewards/Format/std": 0.05474026, "completions/mean_length": 145.815625, "completions/min_length": 39.0, "completions/max_length": 646.6, "completions/clipped_ratio": 0.0, "kl": 0.00078533, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08547009, "global_step/max_steps": "10/2000", "percentage": "0.50%", "elapsed_time": "6m 45s", "remaining_time": "22h 24m 12s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.024674} {"loss": 2.953e-05, "grad_norm": 2.05830401, "learning_rate": 1.5e-07, "reward": 1.23320312, "reward_std": 0.39584332, "frac_reward_zero_std": 0.275, "rewards/MazeReward/mean": 0.01, "rewards/MazeReward/std": 0.04012826, "rewards/MazeFormat/mean": 0.9, "rewards/MazeFormat/std": 0.29746217, "rewards/Format/mean": 0.23320313, "rewards/Format/std": 0.06156868, "completions/mean_length": 148.54375, "completions/min_length": 42.6, "completions/max_length": 664.4, "completions/clipped_ratio": 0.0, "kl": 0.00073749, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.12820513, "global_step/max_steps": "15/2000", "percentage": "0.75%", "elapsed_time": "10m 10s", "remaining_time": "22h 26m 8s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.024576} {"loss": 3.377e-05, "grad_norm": 2.07747156, "learning_rate": 2e-07, "reward": 1.28867187, "reward_std": 0.48192683, "frac_reward_zero_std": 0.275, "rewards/MazeReward/mean": 0.0159375, "rewards/MazeReward/std": 0.05514752, "rewards/MazeFormat/mean": 0.8953125, "rewards/MazeFormat/std": 0.30488806, "rewards/Format/mean": 0.23398437, "rewards/Format/std": 0.05965922, "completions/mean_length": 141.3703125, "completions/min_length": 47.0, "completions/max_length": 640.8, "completions/clipped_ratio": 0.0, "kl": 0.00084334, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17094017, "global_step/max_steps": "20/2000", "percentage": "1.00%", "elapsed_time": "13m 32s", "remaining_time": "22h 21m 14s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.024604} {"loss": 6.408e-05, "grad_norm": 1.58302012, "learning_rate": 2.5e-07, "reward": 1.34042969, "reward_std": 0.37527266, "frac_reward_zero_std": 0.4375, "rewards/MazeReward/mean": 0.01625, "rewards/MazeReward/std": 0.05706109, "rewards/MazeFormat/mean": 0.9359375, "rewards/MazeFormat/std": 0.24299971, "rewards/Format/mean": 0.24199219, "rewards/Format/std": 0.04338447, "completions/mean_length": 125.58125, "completions/min_length": 39.6, "completions/max_length": 460.2, "completions/clipped_ratio": 0.0, "kl": 0.00160135, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21367521, "global_step/max_steps": "25/2000", "percentage": "1.25%", "elapsed_time": "16m 34s", "remaining_time": "21h 50m 1s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.025127} {"loss": 0.0001601, "grad_norm": 1.19552164, "learning_rate": 3e-07, "reward": 1.33046875, "reward_std": 0.32056437, "frac_reward_zero_std": 0.5125, "rewards/MazeReward/mean": 0.0134375, "rewards/MazeReward/std": 0.04837796, "rewards/MazeFormat/mean": 0.953125, "rewards/MazeFormat/std": 0.21053163, "rewards/Format/mean": 0.24296875, "rewards/Format/std": 0.04054766, "completions/mean_length": 120.7453125, "completions/min_length": 40.8, "completions/max_length": 452.8, "completions/clipped_ratio": 0.0, "kl": 0.00400126, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25641026, "global_step/max_steps": "30/2000", "percentage": "1.50%", "elapsed_time": "19m 35s", "remaining_time": "21h 26m 16s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.025526} {"loss": 0.00046548, "grad_norm": 1.31537897, "learning_rate": 3.5e-07, "reward": 1.37402344, "reward_std": 0.231683, "frac_reward_zero_std": 0.675, "rewards/MazeReward/mean": 0.0146875, "rewards/MazeReward/std": 0.05107102, "rewards/MazeFormat/mean": 0.978125, "rewards/MazeFormat/std": 0.14282614, "rewards/Format/mean": 0.24902344, "rewards/Format/std": 0.00843505, "completions/mean_length": 106.0109375, "completions/min_length": 36.8, "completions/max_length": 318.6, "completions/clipped_ratio": 0.0, "kl": 0.01163577, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2991453, "global_step/max_steps": "35/2000", "percentage": "1.75%", "elapsed_time": "22m 17s", "remaining_time": "20h 51m 31s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.026168} {"loss": 0.0007581, "grad_norm": 1.51920373, "learning_rate": 4e-07, "reward": 1.39082031, "reward_std": 0.24733528, "frac_reward_zero_std": 0.7, "rewards/MazeReward/mean": 0.015625, "rewards/MazeReward/std": 0.05470489, "rewards/MazeFormat/mean": 0.9859375, "rewards/MazeFormat/std": 0.10508071, "rewards/Format/mean": 0.24863281, "rewards/Format/std": 0.01376431, "completions/mean_length": 107.7546875, "completions/min_length": 39.6, "completions/max_length": 410.4, "completions/clipped_ratio": 0.0, "kl": 0.01894673, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.34188034, "global_step/max_steps": "40/2000", "percentage": "2.00%", "elapsed_time": "25m 13s", "remaining_time": "20h 36m 16s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.026423} {"loss": 0.00078574, "grad_norm": 0.9839755, "learning_rate": 4.5e-07, "reward": 1.37773437, "reward_std": 0.19830012, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.0140625, "rewards/MazeReward/std": 0.04950801, "rewards/MazeFormat/mean": 0.9875, "rewards/MazeFormat/std": 0.09610849, "rewards/Format/mean": 0.24960937, "rewards/Format/std": 0.00441942, "completions/mean_length": 108.796875, "completions/min_length": 41.4, "completions/max_length": 309.4, "completions/clipped_ratio": 0.0, "kl": 0.01963773, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.38461538, "global_step/max_steps": "45/2000", "percentage": "2.25%", "elapsed_time": "27m 57s", "remaining_time": "20h 14m 25s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02683} {"loss": 0.00069186, "grad_norm": 1.23069496, "learning_rate": 5e-07, "reward": 1.35898437, "reward_std": 0.16672341, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.0121875, "rewards/MazeReward/std": 0.04448289, "rewards/MazeFormat/mean": 0.9875, "rewards/MazeFormat/std": 0.09960552, "rewards/Format/mean": 0.24960937, "rewards/Format/std": 0.00441942, "completions/mean_length": 105.29375, "completions/min_length": 40.8, "completions/max_length": 333.0, "completions/clipped_ratio": 0.0, "kl": 0.01729734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.42735043, "global_step/max_steps": "50/2000", "percentage": "2.50%", "elapsed_time": "30m 42s", "remaining_time": "19h 57m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.027136} {"loss": 0.00062353, "grad_norm": 1.09800134, "learning_rate": 5.5e-07, "reward": 1.3109375, "reward_std": 0.12922157, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.006875, "rewards/MazeReward/std": 0.03500962, "rewards/MazeFormat/mean": 0.9921875, "rewards/MazeFormat/std": 0.06573191, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 104.796875, "completions/min_length": 38.6, "completions/max_length": 287.0, "completions/clipped_ratio": 0.0, "kl": 0.01558351, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.47008547, "global_step/max_steps": "55/2000", "percentage": "2.75%", "elapsed_time": "33m 23s", "remaining_time": "19h 41m 3s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.027447} {"loss": 0.00063183, "grad_norm": 1.31291404, "learning_rate": 6e-07, "reward": 1.38730469, "reward_std": 0.20646045, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.0140625, "rewards/MazeReward/std": 0.0472124, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535534, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 99.0171875, "completions/min_length": 41.0, "completions/max_length": 263.8, "completions/clipped_ratio": 0.0, "kl": 0.01579401, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.51282051, "global_step/max_steps": "60/2000", "percentage": "3.00%", "elapsed_time": "36m 1s", "remaining_time": "19h 24m 54s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.027756} {"loss": 0.00051059, "grad_norm": 1.06160874, "learning_rate": 6.5e-07, "reward": 1.39160156, "reward_std": 0.19745953, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.014375, "rewards/MazeReward/std": 0.04834332, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24941406, "rewards/Format/std": 0.00662913, "completions/mean_length": 102.0234375, "completions/min_length": 42.4, "completions/max_length": 372.4, "completions/clipped_ratio": 0.0, "kl": 0.01276027, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.55555556, "global_step/max_steps": "65/2000", "percentage": "3.25%", "elapsed_time": "38m 52s", "remaining_time": "19h 17m 29s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.027862} {"loss": 0.00031079, "grad_norm": 1.35442422, "learning_rate": 7e-07, "reward": 1.44355469, "reward_std": 0.31596591, "frac_reward_zero_std": 0.675, "rewards/MazeReward/mean": 0.0203125, "rewards/MazeReward/std": 0.06453245, "rewards/MazeFormat/mean": 0.990625, "rewards/MazeFormat/std": 0.0851581, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 115.3265625, "completions/min_length": 44.0, "completions/max_length": 363.6, "completions/clipped_ratio": 0.0, "kl": 0.00776671, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5982906, "global_step/max_steps": "70/2000", "percentage": "3.50%", "elapsed_time": "41m 43s", "remaining_time": "19h 10m 27s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02796} {"loss": 0.00028364, "grad_norm": 1.36024269, "learning_rate": 7.5e-07, "reward": 1.43417969, "reward_std": 0.23621928, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.01875, "rewards/MazeReward/std": 0.06105239, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.02490138, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 115.6609375, "completions/min_length": 46.4, "completions/max_length": 376.6, "completions/clipped_ratio": 0.0, "kl": 0.00708807, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.64102564, "global_step/max_steps": "75/2000", "percentage": "3.75%", "elapsed_time": "44m 38s", "remaining_time": "19h 5m 40s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028004} {"loss": 0.00035074, "grad_norm": 1.35090896, "learning_rate": 8e-07, "reward": 1.4609375, "reward_std": 0.30132957, "frac_reward_zero_std": 0.7, "rewards/MazeReward/mean": 0.0215625, "rewards/MazeReward/std": 0.06277598, "rewards/MazeFormat/mean": 0.9953125, "rewards/MazeFormat/std": 0.04257905, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 115.4859375, "completions/min_length": 43.0, "completions/max_length": 351.0, "completions/clipped_ratio": 0.0, "kl": 0.00876754, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.68376068, "global_step/max_steps": "80/2000", "percentage": "4.00%", "elapsed_time": "47m 29s", "remaining_time": "18h 59m 44s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028077} {"loss": 0.00046558, "grad_norm": 1.10449499, "learning_rate": 8.5e-07, "reward": 1.3734375, "reward_std": 0.18065466, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.013125, "rewards/MazeReward/std": 0.04797709, "rewards/MazeFormat/mean": 0.9921875, "rewards/MazeFormat/std": 0.07793439, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.1203125, "completions/min_length": 45.0, "completions/max_length": 436.4, "completions/clipped_ratio": 0.0, "kl": 0.01163666, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.72649573, "global_step/max_steps": "85/2000", "percentage": "4.25%", "elapsed_time": "50m 26s", "remaining_time": "18h 56m 22s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028086} {"loss": 0.00097924, "grad_norm": 0.99659877, "learning_rate": 9e-07, "reward": 1.471875, "reward_std": 0.23379422, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.0225, "rewards/MazeReward/std": 0.06349937, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535534, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 88.609375, "completions/min_length": 40.6, "completions/max_length": 337.0, "completions/clipped_ratio": 0.0, "kl": 0.02447981, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.76923077, "global_step/max_steps": "90/2000", "percentage": "4.50%", "elapsed_time": "53m 11s", "remaining_time": "18h 48m 43s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028203} {"loss": 0.00149496, "grad_norm": 1.50519518, "learning_rate": 9.5e-07, "reward": 1.5171875, "reward_std": 0.20643335, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.026875, "rewards/MazeReward/std": 0.06502393, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 79.990625, "completions/min_length": 40.4, "completions/max_length": 321.2, "completions/clipped_ratio": 0.0, "kl": 0.03736886, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.81196581, "global_step/max_steps": "95/2000", "percentage": "4.75%", "elapsed_time": "55m 55s", "remaining_time": "18h 41m 18s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028315} {"loss": 0.00167743, "grad_norm": 1.17337662, "learning_rate": 1e-06, "reward": 1.5046875, "reward_std": 0.18200976, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.025625, "rewards/MazeReward/std": 0.06656526, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 80.1984375, "completions/min_length": 41.2, "completions/max_length": 283.4, "completions/clipped_ratio": 0.0, "kl": 0.04193485, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.85470085, "global_step/max_steps": "100/2000", "percentage": "5.00%", "elapsed_time": "58m 36s", "remaining_time": "18h 33m 40s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028434} {"loss": 0.00139517, "grad_norm": 1.41191642, "learning_rate": 1e-06, "reward": 1.4875, "reward_std": 0.22661331, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.02375, "rewards/MazeReward/std": 0.06286502, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 91.3640625, "completions/min_length": 41.2, "completions/max_length": 310.8, "completions/clipped_ratio": 0.0, "kl": 0.0348784, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.8974359, "global_step/max_steps": "105/2000", "percentage": "5.25%", "elapsed_time": "1h 2m 20s", "remaining_time": "18h 45m 15s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028068} {"loss": 0.00148226, "grad_norm": 1.2686401, "learning_rate": 1e-06, "reward": 1.55625, "reward_std": 0.21021372, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.0309375, "rewards/MazeReward/std": 0.07310352, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535534, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 103.746875, "completions/min_length": 38.6, "completions/max_length": 355.4, "completions/clipped_ratio": 0.0, "kl": 0.03705888, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.94017094, "global_step/max_steps": "110/2000", "percentage": "5.50%", "elapsed_time": "1h 5m 12s", "remaining_time": "18h 40m 20s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028116} {"loss": 0.00158323, "grad_norm": 1.08741372, "learning_rate": 1e-06, "reward": 1.590625, "reward_std": 0.25334451, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.0340625, "rewards/MazeReward/std": 0.07615574, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 103.7015625, "completions/min_length": 42.6, "completions/max_length": 252.2, "completions/clipped_ratio": 0.0, "kl": 0.03958274, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.98290598, "global_step/max_steps": "115/2000", "percentage": "5.75%", "elapsed_time": "1h 7m 51s", "remaining_time": "18h 32m 24s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028242} {"loss": 0.0018615, "grad_norm": 0.71024286, "learning_rate": 1e-06, "reward": 1.5640625, "reward_std": 0.29029042, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.0315625, "rewards/MazeReward/std": 0.07654295, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 104.0, "completions/min_length": 42.2, "completions/max_length": 333.8, "completions/clipped_ratio": 0.0, "kl": 0.04654146, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.02564103, "global_step/max_steps": "120/2000", "percentage": "6.00%", "elapsed_time": "1h 10m 39s", "remaining_time": "18h 27m 3s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028303} {"loss": 0.00231443, "grad_norm": 0.76999869, "learning_rate": 1e-06, "reward": 1.65761719, "reward_std": 0.1521605, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.04125, "rewards/MazeReward/std": 0.07993591, "rewards/MazeFormat/mean": 0.9953125, "rewards/MazeFormat/std": 0.04257905, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 103.3359375, "completions/min_length": 44.2, "completions/max_length": 358.6, "completions/clipped_ratio": 0.0, "kl": 0.05786212, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.06837607, "global_step/max_steps": "125/2000", "percentage": "6.25%", "elapsed_time": "1h 13m 31s", "remaining_time": "18h 22m 48s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028337} {"loss": 0.00292002, "grad_norm": 1.14830795, "learning_rate": 1e-06, "reward": 1.675, "reward_std": 0.12972374, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.0425, "rewards/MazeReward/std": 0.08033716, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 84.5421875, "completions/min_length": 41.4, "completions/max_length": 333.6, "completions/clipped_ratio": 0.0, "kl": 0.07299737, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.11111111, "global_step/max_steps": "130/2000", "percentage": "6.50%", "elapsed_time": "1h 16m 14s", "remaining_time": "18h 16m 47s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028416} {"loss": 0.00290156, "grad_norm": 0.22525601, "learning_rate": 1e-06, "reward": 1.475, "reward_std": 0.06849094, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.0225, "rewards/MazeReward/std": 0.0530306, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 80.1828125, "completions/min_length": 41.2, "completions/max_length": 236.8, "completions/clipped_ratio": 0.0, "kl": 0.07253277, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.15384615, "global_step/max_steps": "135/2000", "percentage": "6.75%", "elapsed_time": "1h 18m 49s", "remaining_time": "18h 9m 0s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028543} {"loss": 0.00234762, "grad_norm": 0.75311237, "learning_rate": 1e-06, "reward": 1.60449219, "reward_std": 0.11650569, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.035625, "rewards/MazeReward/std": 0.0761398, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 92.290625, "completions/min_length": 40.2, "completions/max_length": 293.4, "completions/clipped_ratio": 0.0, "kl": 0.05868288, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.1965812, "global_step/max_steps": "140/2000", "percentage": "7.00%", "elapsed_time": "1h 21m 31s", "remaining_time": "18h 3m 11s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028619} {"loss": 0.00186992, "grad_norm": 0.96558219, "learning_rate": 1e-06, "reward": 1.678125, "reward_std": 0.18339579, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.0428125, "rewards/MazeReward/std": 0.08515361, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 107.709375, "completions/min_length": 40.4, "completions/max_length": 365.8, "completions/clipped_ratio": 0.0, "kl": 0.04674591, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.23931624, "global_step/max_steps": "145/2000", "percentage": "7.25%", "elapsed_time": "1h 24m 21s", "remaining_time": "17h 59m 16s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028646} {"loss": 0.00190117, "grad_norm": 0.84423558, "learning_rate": 1e-06, "reward": 1.5875, "reward_std": 0.11563375, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.03375, "rewards/MazeReward/std": 0.0731412, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 106.034375, "completions/min_length": 39.8, "completions/max_length": 373.0, "completions/clipped_ratio": 0.0, "kl": 0.0475337, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.28205128, "global_step/max_steps": "150/2000", "percentage": "7.50%", "elapsed_time": "1h 27m 13s", "remaining_time": "17h 55m 42s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028663} {"loss": 0.0023825, "grad_norm": 0.20366802, "learning_rate": 1e-06, "reward": 1.76230469, "reward_std": 0.05337072, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.05125, "rewards/MazeReward/std": 0.08585172, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 91.5609375, "completions/min_length": 41.4, "completions/max_length": 340.6, "completions/clipped_ratio": 0.0, "kl": 0.05956588, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.32478632, "global_step/max_steps": "155/2000", "percentage": "7.75%", "elapsed_time": "1h 29m 59s", "remaining_time": "17h 51m 9s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028707} {"loss": 0.00233282, "grad_norm": 0.89883649, "learning_rate": 1e-06, "reward": 1.753125, "reward_std": 0.04218915, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0503125, "rewards/MazeReward/std": 0.08477406, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 93.615625, "completions/min_length": 40.2, "completions/max_length": 329.8, "completions/clipped_ratio": 0.0, "kl": 0.058318, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.36752137, "global_step/max_steps": "160/2000", "percentage": "8.00%", "elapsed_time": "1h 32m 46s", "remaining_time": "17h 46m 56s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028743} {"loss": 0.00197243, "grad_norm": 0.68695134, "learning_rate": 1e-06, "reward": 1.6859375, "reward_std": 0.14878431, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.04375, "rewards/MazeReward/std": 0.08216543, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 96.5296875, "completions/min_length": 41.6, "completions/max_length": 275.0, "completions/clipped_ratio": 0.0, "kl": 0.04930294, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.41025641, "global_step/max_steps": "165/2000", "percentage": "8.25%", "elapsed_time": "1h 35m 28s", "remaining_time": "17h 41m 42s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028806} {"loss": 0.00208327, "grad_norm": 0.20026674, "learning_rate": 1e-06, "reward": 1.575, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "rewards/MazeReward/mean": 0.0325, "rewards/MazeReward/std": 0.07358299, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 96.5640625, "completions/min_length": 39.8, "completions/max_length": 289.4, "completions/clipped_ratio": 0.0, "kl": 0.05207844, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.45299145, "global_step/max_steps": "170/2000", "percentage": "8.50%", "elapsed_time": "1h 38m 9s", "remaining_time": "17h 36m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028865} {"loss": 0.00194638, "grad_norm": 0.48721821, "learning_rate": 1e-06, "reward": 1.74375, "reward_std": 0.01767767, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.049375, "rewards/MazeReward/std": 0.08451755, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 105.240625, "completions/min_length": 39.0, "completions/max_length": 306.4, "completions/clipped_ratio": 0.0, "kl": 0.04866266, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.4957265, "global_step/max_steps": "175/2000", "percentage": "8.75%", "elapsed_time": "1h 40m 52s", "remaining_time": "17h 31m 54s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028916} {"loss": 0.00176391, "grad_norm": 0.46120689, "learning_rate": 1e-06, "reward": 1.78261719, "reward_std": 0.0430611, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.0534375, "rewards/MazeReward/std": 0.08518626, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 116.1109375, "completions/min_length": 42.0, "completions/max_length": 333.8, "completions/clipped_ratio": 0.0, "kl": 0.04409476, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.53846154, "global_step/max_steps": "180/2000", "percentage": "9.00%", "elapsed_time": "1h 43m 38s", "remaining_time": "17h 27m 56s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028945} {"loss": 0.00182388, "grad_norm": 0.51481051, "learning_rate": 1e-06, "reward": 1.596875, "reward_std": 0.03198434, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.0346875, "rewards/MazeReward/std": 0.07464145, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.6703125, "completions/min_length": 44.0, "completions/max_length": 417.6, "completions/clipped_ratio": 0.0, "kl": 0.04559737, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.58119658, "global_step/max_steps": "185/2000", "percentage": "9.25%", "elapsed_time": "1h 46m 36s", "remaining_time": "17h 25m 50s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028924} {"loss": 0.00167685, "grad_norm": 0.51389759, "learning_rate": 9.9e-07, "reward": 1.7375, "reward_std": 0.03535534, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.04875, "rewards/MazeReward/std": 0.08519883, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.4046875, "completions/min_length": 42.6, "completions/max_length": 423.0, "completions/clipped_ratio": 0.0, "kl": 0.04191551, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.62393162, "global_step/max_steps": "190/2000", "percentage": "9.50%", "elapsed_time": "1h 49m 31s", "remaining_time": "17h 23m 25s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028911} {"loss": 0.00157113, "grad_norm": 0.90677371, "learning_rate": 9.9e-07, "reward": 1.634375, "reward_std": 0.03808926, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0384375, "rewards/MazeReward/std": 0.07839376, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.6421875, "completions/min_length": 41.0, "completions/max_length": 352.6, "completions/clipped_ratio": 0.0, "kl": 0.03926404, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.66666667, "global_step/max_steps": "195/2000", "percentage": "9.75%", "elapsed_time": "1h 52m 21s", "remaining_time": "17h 20m 1s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028926} {"loss": 0.00166143, "grad_norm": 0.09424511, "learning_rate": 9.9e-07, "reward": 1.840625, "reward_std": 0.02041159, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.0590625, "rewards/MazeReward/std": 0.09025486, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.8484375, "completions/min_length": 40.2, "completions/max_length": 296.2, "completions/clipped_ratio": 0.0, "kl": 0.0415276, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.70940171, "global_step/max_steps": "200/2000", "percentage": "10.00%", "elapsed_time": "1h 55m 2s", "remaining_time": "17h 15m 21s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028976} {"loss": 0.00171922, "grad_norm": 0.67780053, "learning_rate": 9.9e-07, "reward": 1.896875, "reward_std": 0.00883883, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.0646875, "rewards/MazeReward/std": 0.0911929, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.19375, "completions/min_length": 41.6, "completions/max_length": 344.0, "completions/clipped_ratio": 0.0, "kl": 0.04297648, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.75213675, "global_step/max_steps": "205/2000", "percentage": "10.25%", "elapsed_time": "1h 58m 52s", "remaining_time": "17h 20m 57s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02874} {"loss": 0.00147729, "grad_norm": 0.0962596, "learning_rate": 9.9e-07, "reward": 1.55761719, "reward_std": 0.0530512, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.0309375, "rewards/MazeReward/std": 0.05911255, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 121.43125, "completions/min_length": 41.4, "completions/max_length": 394.2, "completions/clipped_ratio": 0.0, "kl": 0.03692597, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.79487179, "global_step/max_steps": "210/2000", "percentage": "10.50%", "elapsed_time": "2h 1m 48s", "remaining_time": "17h 18m 20s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028732} {"loss": 0.00159438, "grad_norm": 0.88599002, "learning_rate": 9.9e-07, "reward": 1.715625, "reward_std": 0.0265165, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.0465625, "rewards/MazeReward/std": 0.08200521, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 115.7890625, "completions/min_length": 40.2, "completions/max_length": 342.8, "completions/clipped_ratio": 0.0, "kl": 0.03985868, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.83760684, "global_step/max_steps": "215/2000", "percentage": "10.75%", "elapsed_time": "2h 4m 37s", "remaining_time": "17h 14m 41s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028752} {"loss": 0.00141457, "grad_norm": 0.38476949, "learning_rate": 9.9e-07, "reward": 1.58261719, "reward_std": 0.0430611, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0334375, "rewards/MazeReward/std": 0.06289278, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 126.36875, "completions/min_length": 42.0, "completions/max_length": 354.8, "completions/clipped_ratio": 0.0, "kl": 0.03536163, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.88034188, "global_step/max_steps": "220/2000", "percentage": "11.00%", "elapsed_time": "2h 7m 26s", "remaining_time": "17h 11m 7s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028771} {"loss": 0.00131763, "grad_norm": 0.10814792, "learning_rate": 9.9e-07, "reward": 1.684375, "reward_std": 0.0222019, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.0434375, "rewards/MazeReward/std": 0.08095957, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.515625, "completions/min_length": 46.0, "completions/max_length": 369.6, "completions/clipped_ratio": 0.0, "kl": 0.03294099, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.92307692, "global_step/max_steps": "225/2000", "percentage": "11.25%", "elapsed_time": "2h 10m 18s", "remaining_time": "17h 8m 3s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028776} {"loss": 0.0011617, "grad_norm": 0.07333539, "learning_rate": 9.9e-07, "reward": 1.571875, "reward_std": 0.00883883, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.0321875, "rewards/MazeReward/std": 0.07254017, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.4390625, "completions/min_length": 47.0, "completions/max_length": 401.4, "completions/clipped_ratio": 0.0, "kl": 0.02903665, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.96581197, "global_step/max_steps": "230/2000", "percentage": "11.50%", "elapsed_time": "2h 13m 16s", "remaining_time": "17h 5m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028762} {"loss": 0.0012318, "grad_norm": 0.09827643, "learning_rate": 9.9e-07, "reward": 1.759375, "reward_std": 0.0306164, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.0509375, "rewards/MazeReward/std": 0.08228155, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.7703125, "completions/min_length": 45.8, "completions/max_length": 362.6, "completions/clipped_ratio": 0.0, "kl": 0.03078949, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.00854701, "global_step/max_steps": "235/2000", "percentage": "11.75%", "elapsed_time": "2h 16m 6s", "remaining_time": "17h 2m 14s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028777} {"loss": 0.00140496, "grad_norm": 0.12057015, "learning_rate": 9.9e-07, "reward": 1.7625, "reward_std": 0.0231455, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.05125, "rewards/MazeReward/std": 0.07912175, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.1171875, "completions/min_length": 46.0, "completions/max_length": 308.4, "completions/clipped_ratio": 0.0, "kl": 0.03512265, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.05128205, "global_step/max_steps": "240/2000", "percentage": "12.00%", "elapsed_time": "2h 18m 52s", "remaining_time": "16h 58m 23s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028804} {"loss": 0.00123478, "grad_norm": 0.09063741, "learning_rate": 9.9e-07, "reward": 1.66875, "reward_std": 0.03104073, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.041875, "rewards/MazeReward/std": 0.0790435, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.7921875, "completions/min_length": 44.0, "completions/max_length": 424.8, "completions/clipped_ratio": 0.0, "kl": 0.03086706, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.09401709, "global_step/max_steps": "245/2000", "percentage": "12.25%", "elapsed_time": "2h 21m 50s", "remaining_time": "16h 55m 59s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02879} {"loss": 0.00121607, "grad_norm": 0.09514702, "learning_rate": 9.8e-07, "reward": 1.61875, "reward_std": 0.04671338, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.036875, "rewards/MazeReward/std": 0.07626683, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.0875, "completions/min_length": 45.4, "completions/max_length": 397.0, "completions/clipped_ratio": 0.0, "kl": 0.03039407, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.13675214, "global_step/max_steps": "250/2000", "percentage": "12.50%", "elapsed_time": "2h 24m 42s", "remaining_time": "16h 52m 58s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028793} {"loss": 0.00133209, "grad_norm": 0.06587226, "learning_rate": 9.8e-07, "reward": 1.890625, "reward_std": 0.02041159, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.0640625, "rewards/MazeReward/std": 0.09125814, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.2296875, "completions/min_length": 40.0, "completions/max_length": 342.6, "completions/clipped_ratio": 0.0, "kl": 0.03329704, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.17948718, "global_step/max_steps": "255/2000", "percentage": "12.75%", "elapsed_time": "2h 27m 33s", "remaining_time": "16h 49m 43s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028803} {"loss": 0.00127244, "grad_norm": 1.13677858, "learning_rate": 9.8e-07, "reward": 1.80253906, "reward_std": 0.05131929, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.055625, "rewards/MazeReward/std": 0.08506897, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.02490138, "rewards/Format/mean": 0.24941406, "rewards/Format/std": 0.00379707, "completions/mean_length": 135.5328125, "completions/min_length": 45.2, "completions/max_length": 359.0, "completions/clipped_ratio": 0.0, "kl": 0.03180913, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.22222222, "global_step/max_steps": "260/2000", "percentage": "13.00%", "elapsed_time": "2h 30m 21s", "remaining_time": "16h 46m 15s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028819} {"loss": 0.00125275, "grad_norm": 0.09204891, "learning_rate": 9.8e-07, "reward": 1.5125, "reward_std": 0.03535534, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.02625, "rewards/MazeReward/std": 0.05746038, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.78125, "completions/min_length": 42.6, "completions/max_length": 349.4, "completions/clipped_ratio": 0.0, "kl": 0.03131272, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.26495726, "global_step/max_steps": "265/2000", "percentage": "13.25%", "elapsed_time": "2h 33m 12s", "remaining_time": "16h 43m 6s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028827} {"loss": 0.00133742, "grad_norm": 0.10430657, "learning_rate": 9.8e-07, "reward": 1.646875, "reward_std": 0.00883883, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.0396875, "rewards/MazeReward/std": 0.07787279, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 115.8, "completions/min_length": 41.6, "completions/max_length": 355.8, "completions/clipped_ratio": 0.0, "kl": 0.0334332, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.30769231, "global_step/max_steps": "270/2000", "percentage": "13.50%", "elapsed_time": "2h 36m 4s", "remaining_time": "16h 40m 3s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028832} {"loss": 0.0010987, "grad_norm": 0.60385845, "learning_rate": 9.8e-07, "reward": 1.58535156, "reward_std": 0.01943976, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.03375, "rewards/MazeReward/std": 0.0672236, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24941406, "rewards/Format/std": 0.00662913, "completions/mean_length": 119.921875, "completions/min_length": 41.2, "completions/max_length": 353.6, "completions/clipped_ratio": 0.0, "kl": 0.0274647, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.35042735, "global_step/max_steps": "275/2000", "percentage": "13.75%", "elapsed_time": "2h 38m 54s", "remaining_time": "16h 36m 50s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028841} {"loss": 0.00118281, "grad_norm": 0.11135024, "learning_rate": 9.8e-07, "reward": 1.528125, "reward_std": 0.0306164, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.0278125, "rewards/MazeReward/std": 0.06877715, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 109.83125, "completions/min_length": 39.0, "completions/max_length": 338.0, "completions/clipped_ratio": 0.0, "kl": 0.02957004, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.39316239, "global_step/max_steps": "280/2000", "percentage": "14.00%", "elapsed_time": "2h 41m 40s", "remaining_time": "16h 33m 8s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028865} {"loss": 0.0012409, "grad_norm": 0.08211805, "learning_rate": 9.8e-07, "reward": 1.66875, "reward_std": 0.01157275, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.041875, "rewards/MazeReward/std": 0.08127498, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 105.146875, "completions/min_length": 44.0, "completions/max_length": 337.2, "completions/clipped_ratio": 0.0, "kl": 0.03102337, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.43589744, "global_step/max_steps": "285/2000", "percentage": "14.25%", "elapsed_time": "2h 44m 28s", "remaining_time": "16h 29m 44s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028879} {"loss": 0.00115531, "grad_norm": 0.08713728, "learning_rate": 9.8e-07, "reward": 1.5734375, "reward_std": 0.00441942, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.0325, "rewards/MazeReward/std": 0.07290461, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.95, "completions/min_length": 44.0, "completions/max_length": 347.2, "completions/clipped_ratio": 0.0, "kl": 0.02888094, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.47863248, "global_step/max_steps": "290/2000", "percentage": "14.50%", "elapsed_time": "2h 47m 20s", "remaining_time": "16h 26m 44s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028883} {"loss": 0.00120709, "grad_norm": 0.62250142, "learning_rate": 9.7e-07, "reward": 1.7078125, "reward_std": 0.04861359, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.0459375, "rewards/MazeReward/std": 0.07529201, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.6546875, "completions/min_length": 43.0, "completions/max_length": 364.6, "completions/clipped_ratio": 0.0, "kl": 0.03017933, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.52136752, "global_step/max_steps": "295/2000", "percentage": "14.75%", "elapsed_time": "2h 50m 9s", "remaining_time": "16h 23m 29s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028894} {"loss": 0.00121489, "grad_norm": 0.10214096, "learning_rate": 9.7e-07, "reward": 1.646875, "reward_std": 0.00883883, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.0396875, "rewards/MazeReward/std": 0.07787279, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 108.06875, "completions/min_length": 44.0, "completions/max_length": 337.2, "completions/clipped_ratio": 0.0, "kl": 0.03037008, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.56410256, "global_step/max_steps": "300/2000", "percentage": "15.00%", "elapsed_time": "2h 52m 58s", "remaining_time": "16h 20m 14s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028905} {"loss": 0.00107706, "grad_norm": 0.07743791, "learning_rate": 9.7e-07, "reward": 1.6828125, "reward_std": 0.03503582, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0434375, "rewards/MazeReward/std": 0.08254079, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 111.646875, "completions/min_length": 42.2, "completions/max_length": 334.6, "completions/clipped_ratio": 0.0, "kl": 0.02692441, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.60683761, "global_step/max_steps": "305/2000", "percentage": "15.25%", "elapsed_time": "2h 56m 52s", "remaining_time": "16h 22m 59s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028739} {"loss": 0.00125578, "grad_norm": 0.59949103, "learning_rate": 9.7e-07, "reward": 1.80761719, "reward_std": 0.03558824, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0559375, "rewards/MazeReward/std": 0.0893729, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 109.6625, "completions/min_length": 41.0, "completions/max_length": 319.2, "completions/clipped_ratio": 0.0, "kl": 0.03139304, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.64957265, "global_step/max_steps": "310/2000", "percentage": "15.50%", "elapsed_time": "2h 59m 39s", "remaining_time": "16h 19m 23s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02876} {"loss": 0.00126479, "grad_norm": 0.51984292, "learning_rate": 9.7e-07, "reward": 1.659375, "reward_std": 0.03808926, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0409375, "rewards/MazeReward/std": 0.07862304, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 103.4078125, "completions/min_length": 44.0, "completions/max_length": 359.2, "completions/clipped_ratio": 0.0, "kl": 0.03161529, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.69230769, "global_step/max_steps": "315/2000", "percentage": "15.75%", "elapsed_time": "3h 2m 28s", "remaining_time": "16h 16m 4s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028772} {"loss": 0.00145139, "grad_norm": 0.08884393, "learning_rate": 9.7e-07, "reward": 1.925, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "rewards/MazeReward/mean": 0.0675, "rewards/MazeReward/std": 0.09291007, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 98.4875, "completions/min_length": 40.2, "completions/max_length": 353.0, "completions/clipped_ratio": 0.0, "kl": 0.03628601, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.73504274, "global_step/max_steps": "320/2000", "percentage": "16.00%", "elapsed_time": "3h 5m 18s", "remaining_time": "16h 12m 49s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028782} {"loss": 0.00122055, "grad_norm": 0.10636528, "learning_rate": 9.7e-07, "reward": 1.675, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "rewards/MazeReward/mean": 0.0425, "rewards/MazeReward/std": 0.07974326, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.2421875, "completions/min_length": 42.2, "completions/max_length": 327.2, "completions/clipped_ratio": 0.0, "kl": 0.03051017, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.77777778, "global_step/max_steps": "325/2000", "percentage": "16.25%", "elapsed_time": "3h 8m 0s", "remaining_time": "16h 8m 57s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028811} {"loss": 0.0011793, "grad_norm": 0.06943369, "learning_rate": 9.6e-07, "reward": 1.70625, "reward_std": 0.0612328, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.045625, "rewards/MazeReward/std": 0.07920215, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 141.16875, "completions/min_length": 47.0, "completions/max_length": 381.4, "completions/clipped_ratio": 0.0, "kl": 0.02947971, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.82051282, "global_step/max_steps": "330/2000", "percentage": "16.50%", "elapsed_time": "3h 10m 50s", "remaining_time": "16h 5m 46s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02882} {"loss": 0.00129462, "grad_norm": 0.07015473, "learning_rate": 9.6e-07, "reward": 1.671875, "reward_std": 0.00883883, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.0421875, "rewards/MazeReward/std": 0.08093533, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.9328125, "completions/min_length": 43.2, "completions/max_length": 373.4, "completions/clipped_ratio": 0.0, "kl": 0.03236345, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.86324786, "global_step/max_steps": "335/2000", "percentage": "16.75%", "elapsed_time": "3h 13m 39s", "remaining_time": "16h 2m 29s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028832} {"loss": 0.00122257, "grad_norm": 0.07766875, "learning_rate": 9.6e-07, "reward": 1.64375, "reward_std": 0.01767767, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.039375, "rewards/MazeReward/std": 0.07964256, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.1890625, "completions/min_length": 41.2, "completions/max_length": 346.4, "completions/clipped_ratio": 0.0, "kl": 0.03055897, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.90598291, "global_step/max_steps": "340/2000", "percentage": "17.00%", "elapsed_time": "3h 16m 26s", "remaining_time": "15h 59m 4s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028847} {"loss": 0.00130218, "grad_norm": 0.50038673, "learning_rate": 9.6e-07, "reward": 1.7, "reward_std": 0.03535534, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.045, "rewards/MazeReward/std": 0.07816829, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 115.2296875, "completions/min_length": 43.8, "completions/max_length": 338.6, "completions/clipped_ratio": 0.0, "kl": 0.0325498, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.94871795, "global_step/max_steps": "345/2000", "percentage": "17.25%", "elapsed_time": "3h 19m 15s", "remaining_time": "15h 55m 51s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028857} {"loss": 0.00115808, "grad_norm": 0.0688774, "learning_rate": 9.6e-07, "reward": 1.7109375, "reward_std": 0.03366984, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.04625, "rewards/MazeReward/std": 0.08034941, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.540625, "completions/min_length": 42.8, "completions/max_length": 371.6, "completions/clipped_ratio": 0.0, "kl": 0.02894459, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.99145299, "global_step/max_steps": "350/2000", "percentage": "17.50%", "elapsed_time": "3h 22m 6s", "remaining_time": "15h 52m 45s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028863} {"loss": 0.00115226, "grad_norm": 0.41790465, "learning_rate": 9.6e-07, "reward": 1.51386719, "reward_std": 0.04716099, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.0265625, "rewards/MazeReward/std": 0.0656324, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 135.721875, "completions/min_length": 46.0, "completions/max_length": 388.0, "completions/clipped_ratio": 0.0, "kl": 0.02880413, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.03418803, "global_step/max_steps": "355/2000", "percentage": "17.75%", "elapsed_time": "3h 25m 1s", "remaining_time": "15h 50m 0s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028859} {"loss": 0.00100384, "grad_norm": 0.68257115, "learning_rate": 9.5e-07, "reward": 1.61699219, "reward_std": 0.02264951, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.036875, "rewards/MazeReward/std": 0.07546763, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 120.253125, "completions/min_length": 45.0, "completions/max_length": 381.8, "completions/clipped_ratio": 0.0, "kl": 0.02509302, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.07692308, "global_step/max_steps": "360/2000", "percentage": "18.00%", "elapsed_time": "3h 27m 53s", "remaining_time": "15h 47m 4s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028861} {"loss": 0.00113751, "grad_norm": 0.07988618, "learning_rate": 9.5e-07, "reward": 1.725, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "rewards/MazeReward/mean": 0.0475, "rewards/MazeReward/std": 0.08356984, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 112.9390625, "completions/min_length": 42.0, "completions/max_length": 298.6, "completions/clipped_ratio": 0.0, "kl": 0.02843865, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.11965812, "global_step/max_steps": "365/2000", "percentage": "18.25%", "elapsed_time": "3h 30m 35s", "remaining_time": "15h 43m 20s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028887} {"loss": 0.00123121, "grad_norm": 0.62774921, "learning_rate": 9.5e-07, "reward": 1.68125, "reward_std": 0.05303301, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.043125, "rewards/MazeReward/std": 0.0824027, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.5203125, "completions/min_length": 45.0, "completions/max_length": 336.2, "completions/clipped_ratio": 0.0, "kl": 0.03078085, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.16239316, "global_step/max_steps": "370/2000", "percentage": "18.50%", "elapsed_time": "3h 33m 23s", "remaining_time": "15h 40m 4s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028898} {"loss": 0.00099771, "grad_norm": 0.50326969, "learning_rate": 9.5e-07, "reward": 1.84824219, "reward_std": 0.15122395, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.06, "rewards/MazeReward/std": 0.0921973, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 119.6640625, "completions/min_length": 45.0, "completions/max_length": 680.4, "completions/clipped_ratio": 0.0015625, "kl": 0.02494008, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.20512821, "global_step/max_steps": "375/2000", "percentage": "18.75%", "elapsed_time": "3h 36m 48s", "remaining_time": "15h 39m 30s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028827} {"loss": 0.0010778, "grad_norm": 0.14253494, "learning_rate": 9.5e-07, "reward": 1.675, "reward_std": 0.0571329, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.0425, "rewards/MazeReward/std": 0.07458726, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 106.1796875, "completions/min_length": 42.6, "completions/max_length": 303.8, "completions/clipped_ratio": 0.0, "kl": 0.02694298, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.24786325, "global_step/max_steps": "380/2000", "percentage": "19.00%", "elapsed_time": "3h 39m 29s", "remaining_time": "15h 35m 43s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028854} {"loss": 0.00129507, "grad_norm": 0.36664148, "learning_rate": 9.5e-07, "reward": 1.67304688, "reward_std": 0.03996608, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0425, "rewards/MazeReward/std": 0.08174438, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24960937, "rewards/Format/std": 0.00441942, "completions/mean_length": 98.975, "completions/min_length": 42.2, "completions/max_length": 270.4, "completions/clipped_ratio": 0.0, "kl": 0.03238046, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.29059829, "global_step/max_steps": "385/2000", "percentage": "19.25%", "elapsed_time": "3h 42m 5s", "remaining_time": "15h 31m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028891} {"loss": 0.00131923, "grad_norm": 0.11241273, "learning_rate": 9.4e-07, "reward": 1.771875, "reward_std": 0.05576692, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0521875, "rewards/MazeReward/std": 0.08882527, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 99.20625, "completions/min_length": 42.4, "completions/max_length": 304.0, "completions/clipped_ratio": 0.0, "kl": 0.03298132, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.33333333, "global_step/max_steps": "390/2000", "percentage": "19.50%", "elapsed_time": "3h 44m 46s", "remaining_time": "15h 27m 55s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028918} {"loss": 0.00121382, "grad_norm": 0.47865036, "learning_rate": 9.4e-07, "reward": 1.6140625, "reward_std": 0.03093592, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0365625, "rewards/MazeReward/std": 0.07630383, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 109.65, "completions/min_length": 42.0, "completions/max_length": 320.2, "completions/clipped_ratio": 0.0, "kl": 0.03033711, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.37606838, "global_step/max_steps": "395/2000", "percentage": "19.75%", "elapsed_time": "3h 47m 31s", "remaining_time": "15h 24m 28s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028935} {"loss": 0.00114561, "grad_norm": 0.44706224, "learning_rate": 9.4e-07, "reward": 1.621875, "reward_std": 0.09722718, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.0371875, "rewards/MazeReward/std": 0.08185895, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.1140625, "completions/min_length": 42.8, "completions/max_length": 322.0, "completions/clipped_ratio": 0.0, "kl": 0.02863241, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.41880342, "global_step/max_steps": "400/2000", "percentage": "20.00%", "elapsed_time": "3h 50m 15s", "remaining_time": "15h 21m 1s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028953} {"loss": 0.00120372, "grad_norm": 0.41086301, "learning_rate": 9.4e-07, "reward": 1.746875, "reward_std": 0.06733968, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.0496875, "rewards/MazeReward/std": 0.08626897, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.16875, "completions/min_length": 46.8, "completions/max_length": 346.6, "completions/clipped_ratio": 0.0, "kl": 0.03009376, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.46153846, "global_step/max_steps": "405/2000", "percentage": "20.25%", "elapsed_time": "3h 54m 13s", "remaining_time": "15h 22m 27s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028818} {"loss": 0.00123858, "grad_norm": 0.4830924, "learning_rate": 9.4e-07, "reward": 1.659375, "reward_std": 0.06733968, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.0409375, "rewards/MazeReward/std": 0.0811727, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.2375, "completions/min_length": 41.0, "completions/max_length": 379.6, "completions/clipped_ratio": 0.0, "kl": 0.03096097, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.5042735, "global_step/max_steps": "410/2000", "percentage": "20.50%", "elapsed_time": "3h 57m 6s", "remaining_time": "15h 19m 31s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028819} {"loss": 0.00138605, "grad_norm": 0.07792719, "learning_rate": 9.3e-07, "reward": 1.80625, "reward_std": 0.05303301, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.055625, "rewards/MazeReward/std": 0.08941979, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.11875, "completions/min_length": 42.6, "completions/max_length": 374.6, "completions/clipped_ratio": 0.0, "kl": 0.03464255, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.54700855, "global_step/max_steps": "415/2000", "percentage": "20.75%", "elapsed_time": "3h 59m 59s", "remaining_time": "15h 16m 35s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02882} {"loss": 0.00128503, "grad_norm": 0.07012098, "learning_rate": 9.3e-07, "reward": 1.721875, "reward_std": 0.00883883, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.0471875, "rewards/MazeReward/std": 0.07888222, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.7765625, "completions/min_length": 44.2, "completions/max_length": 395.6, "completions/clipped_ratio": 0.0, "kl": 0.03212045, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.58974359, "global_step/max_steps": "420/2000", "percentage": "21.00%", "elapsed_time": "4h 2m 54s", "remaining_time": "15h 13m 47s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028818} {"loss": 0.00120007, "grad_norm": 0.06630152, "learning_rate": 9.3e-07, "reward": 1.55, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "rewards/MazeReward/mean": 0.03, "rewards/MazeReward/std": 0.06700893, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 138.6421875, "completions/min_length": 42.2, "completions/max_length": 388.8, "completions/clipped_ratio": 0.0, "kl": 0.03000038, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.63247863, "global_step/max_steps": "425/2000", "percentage": "21.25%", "elapsed_time": "4h 5m 47s", "remaining_time": "15h 10m 52s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028819} {"loss": 0.00108277, "grad_norm": 0.64906595, "learning_rate": 9.3e-07, "reward": 1.71875, "reward_std": 0.04692809, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.046875, "rewards/MazeReward/std": 0.08021234, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.49375, "completions/min_length": 41.2, "completions/max_length": 438.4, "completions/clipped_ratio": 0.0, "kl": 0.02706159, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.67521368, "global_step/max_steps": "430/2000", "percentage": "21.50%", "elapsed_time": "4h 8m 48s", "remaining_time": "15h 8m 27s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028803} {"loss": 0.00097529, "grad_norm": 0.06958013, "learning_rate": 9.3e-07, "reward": 1.653125, "reward_std": 0.04419417, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0403125, "rewards/MazeReward/std": 0.07667018, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.9046875, "completions/min_length": 46.2, "completions/max_length": 402.6, "completions/clipped_ratio": 0.0, "kl": 0.02437833, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.71794872, "global_step/max_steps": "435/2000", "percentage": "21.75%", "elapsed_time": "4h 11m 47s", "remaining_time": "15h 5m 52s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028794} {"loss": 0.00110604, "grad_norm": 0.42095013, "learning_rate": 9.2e-07, "reward": 1.778125, "reward_std": 0.09722718, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.0528125, "rewards/MazeReward/std": 0.09043157, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.5953125, "completions/min_length": 40.6, "completions/max_length": 398.4, "completions/clipped_ratio": 0.0, "kl": 0.02764447, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.76068376, "global_step/max_steps": "440/2000", "percentage": "22.00%", "elapsed_time": "4h 14m 43s", "remaining_time": "15h 3m 5s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02879} {"loss": 0.00131747, "grad_norm": 0.61378116, "learning_rate": 9.2e-07, "reward": 1.853125, "reward_std": 0.11900474, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.0603125, "rewards/MazeReward/std": 0.09812328, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.809375, "completions/min_length": 40.6, "completions/max_length": 364.2, "completions/clipped_ratio": 0.0, "kl": 0.03292817, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.8034188, "global_step/max_steps": "445/2000", "percentage": "22.25%", "elapsed_time": "4h 17m 31s", "remaining_time": "14h 59m 53s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.0288} {"loss": 0.00121272, "grad_norm": 0.11178824, "learning_rate": 9.2e-07, "reward": 1.6234375, "reward_std": 0.03977476, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.0378125, "rewards/MazeReward/std": 0.07775698, "rewards/MazeFormat/mean": 0.9953125, "rewards/MazeFormat/std": 0.04257905, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.4984375, "completions/min_length": 39.2, "completions/max_length": 1026.6, "completions/clipped_ratio": 0.003125, "kl": 0.03031667, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.84615385, "global_step/max_steps": "450/2000", "percentage": "22.50%", "elapsed_time": "4h 21m 39s", "remaining_time": "15h 1m 17s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028663} {"loss": 0.001292, "grad_norm": 0.11069044, "learning_rate": 9.2e-07, "reward": 1.68125, "reward_std": 0.01767767, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.043125, "rewards/MazeReward/std": 0.07953909, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 105.140625, "completions/min_length": 38.6, "completions/max_length": 341.8, "completions/clipped_ratio": 0.0, "kl": 0.03229145, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.88888889, "global_step/max_steps": "455/2000", "percentage": "22.75%", "elapsed_time": "4h 24m 27s", "remaining_time": "14h 57m 59s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028675} {"loss": 0.00108913, "grad_norm": 0.07418347, "learning_rate": 9.1e-07, "reward": 1.684375, "reward_std": 0.06187184, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0434375, "rewards/MazeReward/std": 0.08347698, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.80625, "completions/min_length": 41.6, "completions/max_length": 426.8, "completions/clipped_ratio": 0.0, "kl": 0.02722254, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.93162393, "global_step/max_steps": "460/2000", "percentage": "23.00%", "elapsed_time": "4h 27m 25s", "remaining_time": "14h 55m 17s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028668} {"loss": 0.00115296, "grad_norm": 0.45139617, "learning_rate": 9.1e-07, "reward": 1.671875, "reward_std": 0.06733968, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.0421875, "rewards/MazeReward/std": 0.08248565, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.1578125, "completions/min_length": 42.2, "completions/max_length": 387.4, "completions/clipped_ratio": 0.0, "kl": 0.02881626, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.97435897, "global_step/max_steps": "465/2000", "percentage": "23.25%", "elapsed_time": "4h 30m 18s", "remaining_time": "14h 52m 18s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028671} {"loss": 0.00117569, "grad_norm": 0.07745948, "learning_rate": 9.1e-07, "reward": 1.75625, "reward_std": 0.01767767, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.050625, "rewards/MazeReward/std": 0.08348545, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.49375, "completions/min_length": 42.2, "completions/max_length": 383.6, "completions/clipped_ratio": 0.0, "kl": 0.02938625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.01709402, "global_step/max_steps": "470/2000", "percentage": "23.50%", "elapsed_time": "4h 33m 12s", "remaining_time": "14h 49m 22s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028672} {"loss": 0.00092224, "grad_norm": 0.49465612, "learning_rate": 9.1e-07, "reward": 1.659375, "reward_std": 0.04419417, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.0409375, "rewards/MazeReward/std": 0.08035239, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.2203125, "completions/min_length": 45.6, "completions/max_length": 411.8, "completions/clipped_ratio": 0.0, "kl": 0.02305245, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.05982906, "global_step/max_steps": "475/2000", "percentage": "23.75%", "elapsed_time": "4h 36m 8s", "remaining_time": "14h 46m 34s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028668} {"loss": 0.00094219, "grad_norm": 0.38044152, "learning_rate": 9e-07, "reward": 1.78730469, "reward_std": 0.03590777, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.05375, "rewards/MazeReward/std": 0.08982029, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 139.25625, "completions/min_length": 46.4, "completions/max_length": 397.4, "completions/clipped_ratio": 0.0, "kl": 0.02354995, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.1025641, "global_step/max_steps": "480/2000", "percentage": "24.00%", "elapsed_time": "4h 39m 1s", "remaining_time": "14h 43m 33s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028672} {"loss": 0.00092688, "grad_norm": 0.05711981, "learning_rate": 9e-07, "reward": 1.7421875, "reward_std": 0.07312507, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.049375, "rewards/MazeReward/std": 0.08435956, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.678125, "completions/min_length": 42.8, "completions/max_length": 399.2, "completions/clipped_ratio": 0.0, "kl": 0.02316641, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.14529915, "global_step/max_steps": "485/2000", "percentage": "24.25%", "elapsed_time": "4h 41m 53s", "remaining_time": "14h 40m 32s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028675} {"loss": 0.00110707, "grad_norm": 0.0658526, "learning_rate": 9e-07, "reward": 1.765625, "reward_std": 0.04419417, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.0515625, "rewards/MazeReward/std": 0.08912842, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.6171875, "completions/min_length": 43.2, "completions/max_length": 398.0, "completions/clipped_ratio": 0.0, "kl": 0.02767113, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.18803419, "global_step/max_steps": "490/2000", "percentage": "24.50%", "elapsed_time": "4h 44m 48s", "remaining_time": "14h 37m 40s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028674} {"loss": 0.00105657, "grad_norm": 0.07195954, "learning_rate": 9e-07, "reward": 1.653125, "reward_std": 0.07954951, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0403125, "rewards/MazeReward/std": 0.0819414, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.1890625, "completions/min_length": 47.0, "completions/max_length": 413.6, "completions/clipped_ratio": 0.0, "kl": 0.02640679, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.23076923, "global_step/max_steps": "495/2000", "percentage": "24.75%", "elapsed_time": "4h 47m 43s", "remaining_time": "14h 34m 47s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028674} {"loss": 0.00130335, "grad_norm": 0.77816813, "learning_rate": 8.9e-07, "reward": 1.8734375, "reward_std": 0.1281631, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.0625, "rewards/MazeReward/std": 0.09654749, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 145.796875, "completions/min_length": 46.4, "completions/max_length": 362.6, "completions/clipped_ratio": 0.0, "kl": 0.03257816, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.27350427, "global_step/max_steps": "500/2000", "percentage": "25.00%", "elapsed_time": "4h 50m 32s", "remaining_time": "14h 31m 36s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028683} {"loss": 0.00151626, "grad_norm": 0.35789182, "learning_rate": 8.9e-07, "reward": 1.68125, "reward_std": 0.15235702, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.043125, "rewards/MazeReward/std": 0.08651674, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 137.4609375, "completions/min_length": 46.6, "completions/max_length": 359.8, "completions/clipped_ratio": 0.0, "kl": 0.03791391, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.31623932, "global_step/max_steps": "505/2000", "percentage": "25.25%", "elapsed_time": "4h 54m 24s", "remaining_time": "14h 31m 33s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028589} {"loss": 0.00204518, "grad_norm": 0.09537837, "learning_rate": 8.9e-07, "reward": 1.75, "reward_std": 0.08984613, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.05, "rewards/MazeReward/std": 0.0900719, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.7265625, "completions/min_length": 46.4, "completions/max_length": 450.0, "completions/clipped_ratio": 0.0, "kl": 0.05111516, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.35897436, "global_step/max_steps": "510/2000", "percentage": "25.50%", "elapsed_time": "4h 57m 25s", "remaining_time": "14h 28m 58s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028578} {"loss": 0.00160847, "grad_norm": 0.48245592, "learning_rate": 8.9e-07, "reward": 1.671875, "reward_std": 0.12037268, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.0421875, "rewards/MazeReward/std": 0.08702818, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.690625, "completions/min_length": 46.0, "completions/max_length": 432.6, "completions/clipped_ratio": 0.0, "kl": 0.0402068, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.4017094, "global_step/max_steps": "515/2000", "percentage": "25.75%", "elapsed_time": "5h 0m 24s", "remaining_time": "14h 26m 12s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028573} {"loss": 0.00128271, "grad_norm": 0.85770947, "learning_rate": 8.8e-07, "reward": 1.6953125, "reward_std": 0.22211109, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.0446875, "rewards/MazeReward/std": 0.09304452, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 146.3796875, "completions/min_length": 45.8, "completions/max_length": 430.6, "completions/clipped_ratio": 0.0, "kl": 0.03206656, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.44444444, "global_step/max_steps": "520/2000", "percentage": "26.00%", "elapsed_time": "5h 3m 23s", "remaining_time": "14h 23m 29s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028566} {"loss": 0.00132567, "grad_norm": 0.10337241, "learning_rate": 8.8e-07, "reward": 1.803125, "reward_std": 0.13762092, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.0553125, "rewards/MazeReward/std": 0.0984746, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 153.4734375, "completions/min_length": 52.4, "completions/max_length": 485.2, "completions/clipped_ratio": 0.0, "kl": 0.03314277, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.48717949, "global_step/max_steps": "525/2000", "percentage": "26.25%", "elapsed_time": "5h 6m 26s", "remaining_time": "14h 20m 56s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028554} {"loss": 0.00150939, "grad_norm": 0.78221202, "learning_rate": 8.8e-07, "reward": 1.834375, "reward_std": 0.18980527, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.0584375, "rewards/MazeReward/std": 0.09915196, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 158.69375, "completions/min_length": 48.2, "completions/max_length": 485.6, "completions/clipped_ratio": 0.0, "kl": 0.03773374, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.52991453, "global_step/max_steps": "530/2000", "percentage": "26.50%", "elapsed_time": "5h 9m 31s", "remaining_time": "14h 18m 29s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028538} {"loss": 0.00179396, "grad_norm": 0.78056503, "learning_rate": 8.8e-07, "reward": 1.821875, "reward_std": 0.19949473, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.0571875, "rewards/MazeReward/std": 0.10765503, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 148.371875, "completions/min_length": 44.8, "completions/max_length": 423.0, "completions/clipped_ratio": 0.0, "kl": 0.04483983, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.57264957, "global_step/max_steps": "535/2000", "percentage": "26.75%", "elapsed_time": "5h 12m 28s", "remaining_time": "14h 15m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028535} {"loss": 0.00217172, "grad_norm": 0.49202598, "learning_rate": 8.7e-07, "reward": 1.728125, "reward_std": 0.20894073, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.0478125, "rewards/MazeReward/std": 0.09794876, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 141.103125, "completions/min_length": 48.0, "completions/max_length": 418.2, "completions/clipped_ratio": 0.0, "kl": 0.05429116, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.61538462, "global_step/max_steps": "540/2000", "percentage": "27.00%", "elapsed_time": "5h 15m 24s", "remaining_time": "14h 12m 47s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028534} {"loss": 0.0018487, "grad_norm": 0.0999729, "learning_rate": 8.7e-07, "reward": 1.859375, "reward_std": 0.16687135, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.0609375, "rewards/MazeReward/std": 0.10001743, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.7015625, "completions/min_length": 46.0, "completions/max_length": 312.6, "completions/clipped_ratio": 0.0, "kl": 0.04621434, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.65811966, "global_step/max_steps": "545/2000", "percentage": "27.25%", "elapsed_time": "5h 18m 10s", "remaining_time": "14h 9m 27s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028548} {"loss": 0.00181476, "grad_norm": 1.1045319, "learning_rate": 8.7e-07, "reward": 1.76875, "reward_std": 0.23673532, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.051875, "rewards/MazeReward/std": 0.0953328, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.375, "completions/min_length": 47.4, "completions/max_length": 329.2, "completions/clipped_ratio": 0.0, "kl": 0.04536463, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.7008547, "global_step/max_steps": "550/2000", "percentage": "27.50%", "elapsed_time": "5h 21m 0s", "remaining_time": "14h 6m 17s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028556} {"loss": 0.00268573, "grad_norm": 0.71778738, "learning_rate": 8.7e-07, "reward": 1.796875, "reward_std": 0.28869986, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.0546875, "rewards/MazeReward/std": 0.10365346, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.9046875, "completions/min_length": 45.4, "completions/max_length": 365.2, "completions/clipped_ratio": 0.0, "kl": 0.06714675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.74358974, "global_step/max_steps": "555/2000", "percentage": "27.75%", "elapsed_time": "5h 23m 51s", "remaining_time": "14h 3m 11s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028562} {"loss": 0.00310747, "grad_norm": 1.45827893, "learning_rate": 8.6e-07, "reward": 2.04375, "reward_std": 0.49395994, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.079375, "rewards/MazeReward/std": 0.13558758, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.7328125, "completions/min_length": 45.4, "completions/max_length": 385.0, "completions/clipped_ratio": 0.0, "kl": 0.0776826, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.78632479, "global_step/max_steps": "560/2000", "percentage": "28.00%", "elapsed_time": "5h 26m 46s", "remaining_time": "14h 0m 16s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028562} {"loss": 0.00376823, "grad_norm": 0.5188678, "learning_rate": 8.6e-07, "reward": 1.940625, "reward_std": 0.25565097, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.0690625, "rewards/MazeReward/std": 0.11698136, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.7515625, "completions/min_length": 45.2, "completions/max_length": 427.8, "completions/clipped_ratio": 0.0, "kl": 0.09420239, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.82905983, "global_step/max_steps": "565/2000", "percentage": "28.25%", "elapsed_time": "5h 29m 43s", "remaining_time": "13h 57m 26s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028559} {"loss": 0.00383922, "grad_norm": 0.90886876, "learning_rate": 8.6e-07, "reward": 2.059375, "reward_std": 0.38991107, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.0809375, "rewards/MazeReward/std": 0.13103084, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.4171875, "completions/min_length": 45.4, "completions/max_length": 324.2, "completions/clipped_ratio": 0.0, "kl": 0.0959618, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.87179487, "global_step/max_steps": "570/2000", "percentage": "28.50%", "elapsed_time": "5h 32m 28s", "remaining_time": "13h 54m 6s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028573} {"loss": 0.00400726, "grad_norm": 0.92080859, "learning_rate": 8.5e-07, "reward": 1.965625, "reward_std": 0.26755938, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.0715625, "rewards/MazeReward/std": 0.12371677, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.2703125, "completions/min_length": 47.2, "completions/max_length": 315.8, "completions/clipped_ratio": 0.0, "kl": 0.10018278, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.91452991, "global_step/max_steps": "575/2000", "percentage": "28.75%", "elapsed_time": "5h 35m 13s", "remaining_time": "13h 50m 46s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028588} {"loss": 0.00403678, "grad_norm": 0.6640475, "learning_rate": 8.5e-07, "reward": 2.1015625, "reward_std": 0.29669989, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.0853125, "rewards/MazeReward/std": 0.12427427, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 115.0890625, "completions/min_length": 44.0, "completions/max_length": 336.8, "completions/clipped_ratio": 0.0, "kl": 0.10094725, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.95726496, "global_step/max_steps": "580/2000", "percentage": "29.00%", "elapsed_time": "5h 37m 58s", "remaining_time": "13h 47m 27s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028601} {"loss": 0.00372701, "grad_norm": 1.16636941, "learning_rate": 8.5e-07, "reward": 2.028125, "reward_std": 0.45681936, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.0778125, "rewards/MazeReward/std": 0.13334925, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.65625, "completions/min_length": 51.8, "completions/max_length": 414.4, "completions/clipped_ratio": 0.0, "kl": 0.09316067, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.0, "global_step/max_steps": "585/2000", "percentage": "29.25%", "elapsed_time": "5h 40m 57s", "remaining_time": "13h 44m 42s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028596} {"loss": 0.00417897, "grad_norm": 0.6440667, "learning_rate": 8.4e-07, "reward": 2.1078125, "reward_std": 0.29008276, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.0859375, "rewards/MazeReward/std": 0.13166366, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.6875, "completions/min_length": 49.6, "completions/max_length": 760.4, "completions/clipped_ratio": 0.0015625, "kl": 0.10447121, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.04273504, "global_step/max_steps": "590/2000", "percentage": "29.50%", "elapsed_time": "5h 44m 34s", "remaining_time": "13h 43m 28s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028538} {"loss": 0.00439343, "grad_norm": 0.93903686, "learning_rate": 8.4e-07, "reward": 2.228125, "reward_std": 0.35290807, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.09781251, "rewards/MazeReward/std": 0.14086825, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.784375, "completions/min_length": 48.0, "completions/max_length": 363.2, "completions/clipped_ratio": 0.0, "kl": 0.10983695, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.08547009, "global_step/max_steps": "595/2000", "percentage": "29.75%", "elapsed_time": "5h 47m 24s", "remaining_time": "13h 40m 21s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028544} {"loss": 0.00427211, "grad_norm": 0.15068417, "learning_rate": 8.4e-07, "reward": 1.928125, "reward_std": 0.20876103, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.0678125, "rewards/MazeReward/std": 0.11261839, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.24375, "completions/min_length": 46.4, "completions/max_length": 436.0, "completions/clipped_ratio": 0.0, "kl": 0.10680327, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.12820513, "global_step/max_steps": "600/2000", "percentage": "30.00%", "elapsed_time": "5h 50m 22s", "remaining_time": "13h 37m 31s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028541} {"loss": 0.00404644, "grad_norm": 1.07402049, "learning_rate": 8.4e-07, "reward": 1.890625, "reward_std": 0.38107421, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.0640625, "rewards/MazeReward/std": 0.12369903, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.8765625, "completions/min_length": 49.6, "completions/max_length": 324.8, "completions/clipped_ratio": 0.0, "kl": 0.10116232, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.17094017, "global_step/max_steps": "605/2000", "percentage": "30.25%", "elapsed_time": "5h 54m 11s", "remaining_time": "13h 36m 41s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028469} {"loss": 0.00393662, "grad_norm": 1.37503848, "learning_rate": 8.3e-07, "reward": 1.9625, "reward_std": 0.45365602, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.07125, "rewards/MazeReward/std": 0.12817185, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.2015625, "completions/min_length": 47.6, "completions/max_length": 382.6, "completions/clipped_ratio": 0.0, "kl": 0.09841637, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.21367521, "global_step/max_steps": "610/2000", "percentage": "30.50%", "elapsed_time": "5h 57m 4s", "remaining_time": "13h 33m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028472} {"loss": 0.00365212, "grad_norm": 1.17236444, "learning_rate": 8.3e-07, "reward": 2.1125, "reward_std": 0.30008076, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.08625, "rewards/MazeReward/std": 0.1241547, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.0296875, "completions/min_length": 46.4, "completions/max_length": 367.4, "completions/clipped_ratio": 0.0, "kl": 0.09129949, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.25641026, "global_step/max_steps": "615/2000", "percentage": "30.75%", "elapsed_time": "5h 59m 54s", "remaining_time": "13h 30m 31s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028479} {"loss": 0.00377846, "grad_norm": 0.94974378, "learning_rate": 8.3e-07, "reward": 1.978125, "reward_std": 0.26701214, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.0728125, "rewards/MazeReward/std": 0.12639591, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.265625, "completions/min_length": 48.0, "completions/max_length": 360.8, "completions/clipped_ratio": 0.0, "kl": 0.09446963, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.2991453, "global_step/max_steps": "620/2000", "percentage": "31.00%", "elapsed_time": "6h 2m 44s", "remaining_time": "13h 27m 24s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028486} {"loss": 0.00308664, "grad_norm": 0.51904455, "learning_rate": 8.2e-07, "reward": 2.36875, "reward_std": 0.16285934, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.111875, "rewards/MazeReward/std": 0.13475142, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 135.5703125, "completions/min_length": 48.6, "completions/max_length": 362.8, "completions/clipped_ratio": 0.0, "kl": 0.07716013, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.34188034, "global_step/max_steps": "625/2000", "percentage": "31.25%", "elapsed_time": "6h 5m 33s", "remaining_time": "13h 24m 13s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028495} {"loss": 0.00311706, "grad_norm": 0.09794822, "learning_rate": 8.2e-07, "reward": 1.925, "reward_std": 0.13193328, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.0675, "rewards/MazeReward/std": 0.11543772, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 147.7828125, "completions/min_length": 50.4, "completions/max_length": 334.4, "completions/clipped_ratio": 0.0, "kl": 0.07791154, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.38461538, "global_step/max_steps": "630/2000", "percentage": "31.50%", "elapsed_time": "6h 8m 19s", "remaining_time": "13h 20m 57s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028507} {"loss": 0.00352298, "grad_norm": 1.11491401, "learning_rate": 8.2e-07, "reward": 2.3625, "reward_std": 0.36211257, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.11125, "rewards/MazeReward/std": 0.14898888, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.090625, "completions/min_length": 50.0, "completions/max_length": 445.4, "completions/clipped_ratio": 0.0, "kl": 0.08806669, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.42735043, "global_step/max_steps": "635/2000", "percentage": "31.75%", "elapsed_time": "6h 11m 19s", "remaining_time": "13h 18m 11s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028502} {"loss": 0.00439495, "grad_norm": 0.66598394, "learning_rate": 8.1e-07, "reward": 2.228125, "reward_std": 0.26746953, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.0978125, "rewards/MazeReward/std": 0.14412248, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.378125, "completions/min_length": 49.2, "completions/max_length": 373.2, "completions/clipped_ratio": 0.0, "kl": 0.10985708, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.47008547, "global_step/max_steps": "640/2000", "percentage": "32.00%", "elapsed_time": "6h 14m 12s", "remaining_time": "13h 15m 12s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028504} {"loss": 0.00433344, "grad_norm": 0.95658921, "learning_rate": 8.1e-07, "reward": 2.096875, "reward_std": 0.37645112, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.0846875, "rewards/MazeReward/std": 0.13841112, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.1984375, "completions/min_length": 48.8, "completions/max_length": 334.0, "completions/clipped_ratio": 0.0, "kl": 0.10833389, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.51282051, "global_step/max_steps": "645/2000", "percentage": "32.25%", "elapsed_time": "6h 16m 58s", "remaining_time": "13h 11m 56s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028516} {"loss": 0.0047356, "grad_norm": 0.82262244, "learning_rate": 8.1e-07, "reward": 1.909375, "reward_std": 0.37183195, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.0659375, "rewards/MazeReward/std": 0.13234255, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.625, "completions/min_length": 49.4, "completions/max_length": 319.4, "completions/clipped_ratio": 0.0, "kl": 0.1184014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.55555556, "global_step/max_steps": "650/2000", "percentage": "32.50%", "elapsed_time": "6h 19m 43s", "remaining_time": "13h 8m 40s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028529} {"loss": 0.00495661, "grad_norm": 0.56363746, "learning_rate": 8e-07, "reward": 2.240625, "reward_std": 0.44010424, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.0990625, "rewards/MazeReward/std": 0.15114952, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 107.1859375, "completions/min_length": 43.4, "completions/max_length": 323.4, "completions/clipped_ratio": 0.0, "kl": 0.12389737, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.5982906, "global_step/max_steps": "655/2000", "percentage": "32.75%", "elapsed_time": "6h 22m 29s", "remaining_time": "13h 5m 24s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028542} {"loss": 0.00456361, "grad_norm": 0.1061651, "learning_rate": 8e-07, "reward": 2.18125, "reward_std": 0.41402109, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.093125, "rewards/MazeReward/std": 0.14360718, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.5671875, "completions/min_length": 48.4, "completions/max_length": 273.4, "completions/clipped_ratio": 0.0, "kl": 0.11408241, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.64102564, "global_step/max_steps": "660/2000", "percentage": "33.00%", "elapsed_time": "6h 25m 7s", "remaining_time": "13h 1m 54s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028563} {"loss": 0.00429576, "grad_norm": 1.08030982, "learning_rate": 8e-07, "reward": 2.375, "reward_std": 0.34318867, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.11250001, "rewards/MazeReward/std": 0.14831583, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 112.2515625, "completions/min_length": 43.4, "completions/max_length": 307.4, "completions/clipped_ratio": 0.0, "kl": 0.10736344, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.68376068, "global_step/max_steps": "665/2000", "percentage": "33.25%", "elapsed_time": "6h 27m 52s", "remaining_time": "12h 58m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028575} {"loss": 0.00376309, "grad_norm": 1.05431093, "learning_rate": 7.9e-07, "reward": 2.4296875, "reward_std": 0.40401407, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.118125, "rewards/MazeReward/std": 0.15680089, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.2609375, "completions/min_length": 54.0, "completions/max_length": 307.0, "completions/clipped_ratio": 0.0, "kl": 0.09406611, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.72649573, "global_step/max_steps": "670/2000", "percentage": "33.50%", "elapsed_time": "6h 30m 37s", "remaining_time": "12h 55m 25s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028587} {"loss": 0.00362314, "grad_norm": 1.0450826, "learning_rate": 7.9e-07, "reward": 2.32167969, "reward_std": 0.44570921, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.1071875, "rewards/MazeReward/std": 0.15488833, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 136.3890625, "completions/min_length": 54.8, "completions/max_length": 348.0, "completions/clipped_ratio": 0.0, "kl": 0.09056175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.76923077, "global_step/max_steps": "675/2000", "percentage": "33.75%", "elapsed_time": "6h 33m 23s", "remaining_time": "12h 52m 12s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028598} {"loss": 0.00366852, "grad_norm": 0.98862032, "learning_rate": 7.9e-07, "reward": 2.475, "reward_std": 0.5890827, "frac_reward_zero_std": 0.6875, "rewards/MazeReward/mean": 0.1228125, "rewards/MazeReward/std": 0.16487362, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535534, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.19375, "completions/min_length": 53.2, "completions/max_length": 408.2, "completions/clipped_ratio": 0.0, "kl": 0.09170095, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.81196581, "global_step/max_steps": "680/2000", "percentage": "34.00%", "elapsed_time": "6h 36m 17s", "remaining_time": "12h 49m 16s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028598} {"loss": 0.00462695, "grad_norm": 0.910429, "learning_rate": 7.8e-07, "reward": 2.3546875, "reward_std": 0.61176647, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.110625, "rewards/MazeReward/std": 0.16645533, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.44375, "completions/min_length": 47.8, "completions/max_length": 739.8, "completions/clipped_ratio": 0.0015625, "kl": 0.11567545, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.85470085, "global_step/max_steps": "685/2000", "percentage": "34.25%", "elapsed_time": "6h 39m 50s", "remaining_time": "12h 47m 35s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028553} {"loss": 0.00490649, "grad_norm": 0.72267885, "learning_rate": 7.8e-07, "reward": 2.10917969, "reward_std": 0.35595683, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.08625, "rewards/MazeReward/std": 0.1403069, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535534, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 112.5953125, "completions/min_length": 46.4, "completions/max_length": 679.4, "completions/clipped_ratio": 0.0015625, "kl": 0.12265814, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.8974359, "global_step/max_steps": "690/2000", "percentage": "34.50%", "elapsed_time": "6h 43m 18s", "remaining_time": "12h 45m 42s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028514} {"loss": 0.00484369, "grad_norm": 0.84595055, "learning_rate": 7.8e-07, "reward": 2.2890625, "reward_std": 0.50482999, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.1040625, "rewards/MazeReward/std": 0.15604965, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.75, "completions/min_length": 48.6, "completions/max_length": 640.8, "completions/clipped_ratio": 0.0015625, "kl": 0.12108418, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.94017094, "global_step/max_steps": "695/2000", "percentage": "34.75%", "elapsed_time": "6h 46m 39s", "remaining_time": "12h 43m 34s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028484} {"loss": 0.00536524, "grad_norm": 1.1244006, "learning_rate": 7.7e-07, "reward": 2.04667969, "reward_std": 0.47798594, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.0803125, "rewards/MazeReward/std": 0.14358068, "rewards/MazeFormat/mean": 0.99375, "rewards/MazeFormat/std": 0.07071068, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 126.9140625, "completions/min_length": 50.8, "completions/max_length": 1695.6, "completions/clipped_ratio": 0.00625, "kl": 0.13407599, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.98290598, "global_step/max_steps": "700/2000", "percentage": "35.00%", "elapsed_time": "6h 52m 2s", "remaining_time": "12h 45m 13s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028314} {"loss": 0.00539926, "grad_norm": 1.83474464, "learning_rate": 7.7e-07, "reward": 2.359375, "reward_std": 0.60896248, "frac_reward_zero_std": 0.6875, "rewards/MazeReward/mean": 0.1109375, "rewards/MazeReward/std": 0.16333802, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.090625, "completions/min_length": 43.6, "completions/max_length": 320.6, "completions/clipped_ratio": 0.0, "kl": 0.13496986, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.02564103, "global_step/max_steps": "705/2000", "percentage": "35.25%", "elapsed_time": "6h 55m 55s", "remaining_time": "12h 44m 0s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02825} {"loss": 0.00543157, "grad_norm": 0.71198699, "learning_rate": 7.7e-07, "reward": 2.35, "reward_std": 0.17233722, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.11, "rewards/MazeReward/std": 0.14970978, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 106.2421875, "completions/min_length": 46.0, "completions/max_length": 288.6, "completions/clipped_ratio": 0.0, "kl": 0.13576708, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.06837607, "global_step/max_steps": "710/2000", "percentage": "35.50%", "elapsed_time": "6h 58m 38s", "remaining_time": "12h 40m 37s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028266} {"loss": 0.00504921, "grad_norm": 1.35423566, "learning_rate": 7.6e-07, "reward": 2.453125, "reward_std": 0.39074876, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.12031251, "rewards/MazeReward/std": 0.15516621, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.7984375, "completions/min_length": 47.2, "completions/max_length": 324.0, "completions/clipped_ratio": 0.0, "kl": 0.1262269, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.11111111, "global_step/max_steps": "715/2000", "percentage": "35.75%", "elapsed_time": "7h 1m 22s", "remaining_time": "12h 37m 17s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028281} {"loss": 0.0044935, "grad_norm": 1.05130484, "learning_rate": 7.6e-07, "reward": 2.2828125, "reward_std": 0.45589269, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.10375, "rewards/MazeReward/std": 0.15448315, "rewards/MazeFormat/mean": 0.9953125, "rewards/MazeFormat/std": 0.04257905, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.815625, "completions/min_length": 46.6, "completions/max_length": 1031.0, "completions/clipped_ratio": 0.003125, "kl": 0.11233654, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.15384615, "global_step/max_steps": "720/2000", "percentage": "36.00%", "elapsed_time": "7h 5m 35s", "remaining_time": "12h 36m 36s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028196} {"loss": 0.00485673, "grad_norm": 1.48518892, "learning_rate": 7.6e-07, "reward": 2.484375, "reward_std": 0.60578619, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.1234375, "rewards/MazeReward/std": 0.17167752, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 112.59375, "completions/min_length": 43.6, "completions/max_length": 316.4, "completions/clipped_ratio": 0.0, "kl": 0.12141172, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.1965812, "global_step/max_steps": "725/2000", "percentage": "36.25%", "elapsed_time": "7h 8m 22s", "remaining_time": "12h 33m 20s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028207} {"loss": 0.0048633, "grad_norm": 0.8669441, "learning_rate": 7.5e-07, "reward": 2.409375, "reward_std": 0.45396569, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.1159375, "rewards/MazeReward/std": 0.16991186, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 107.7328125, "completions/min_length": 44.4, "completions/max_length": 272.8, "completions/clipped_ratio": 0.0, "kl": 0.12156374, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.23931624, "global_step/max_steps": "730/2000", "percentage": "36.50%", "elapsed_time": "7h 11m 2s", "remaining_time": "12h 29m 53s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028226} {"loss": 0.0050648, "grad_norm": 1.47220024, "learning_rate": 7.5e-07, "reward": 2.525, "reward_std": 0.38043125, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.1275, "rewards/MazeReward/std": 0.16860854, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.7125, "completions/min_length": 42.0, "completions/max_length": 329.4, "completions/clipped_ratio": 0.0, "kl": 0.12660212, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.28205128, "global_step/max_steps": "735/2000", "percentage": "36.75%", "elapsed_time": "7h 13m 50s", "remaining_time": "12h 26m 40s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028236} {"loss": 0.00479762, "grad_norm": 0.55055025, "learning_rate": 7.5e-07, "reward": 2.06875, "reward_std": 0.20705547, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.081875, "rewards/MazeReward/std": 0.13211893, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 107.0140625, "completions/min_length": 44.2, "completions/max_length": 289.8, "completions/clipped_ratio": 0.0, "kl": 0.11993481, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.32478632, "global_step/max_steps": "740/2000", "percentage": "37.00%", "elapsed_time": "7h 16m 32s", "remaining_time": "12h 23m 17s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028253} {"loss": 0.0050775, "grad_norm": 0.53122501, "learning_rate": 7.4e-07, "reward": 2.1375, "reward_std": 0.28113912, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.08875, "rewards/MazeReward/std": 0.14652519, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 105.2875, "completions/min_length": 44.0, "completions/max_length": 288.6, "completions/clipped_ratio": 0.0, "kl": 0.12694653, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.36752137, "global_step/max_steps": "745/2000", "percentage": "37.25%", "elapsed_time": "7h 19m 13s", "remaining_time": "12h 19m 53s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02827} {"loss": 0.00476023, "grad_norm": 0.99923596, "learning_rate": 7.4e-07, "reward": 2.08125, "reward_std": 0.27081258, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.083125, "rewards/MazeReward/std": 0.13845863, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 108.621875, "completions/min_length": 39.0, "completions/max_length": 344.2, "completions/clipped_ratio": 0.0, "kl": 0.11899666, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.41025641, "global_step/max_steps": "750/2000", "percentage": "37.50%", "elapsed_time": "7h 22m 3s", "remaining_time": "12h 16m 45s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028277} {"loss": 0.00468179, "grad_norm": 1.18016944, "learning_rate": 7.3e-07, "reward": 2.040625, "reward_std": 0.28785119, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.0790625, "rewards/MazeReward/std": 0.13535543, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.45625, "completions/min_length": 44.2, "completions/max_length": 345.6, "completions/clipped_ratio": 0.0, "kl": 0.11703309, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.45299145, "global_step/max_steps": "755/2000", "percentage": "37.75%", "elapsed_time": "7h 24m 52s", "remaining_time": "12h 13m 35s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028285} {"loss": 0.00436868, "grad_norm": 1.09554147, "learning_rate": 7.3e-07, "reward": 2.4, "reward_std": 0.48418455, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.11500001, "rewards/MazeReward/std": 0.16087429, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.95625, "completions/min_length": 44.2, "completions/max_length": 342.0, "completions/clipped_ratio": 0.0, "kl": 0.10920556, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.4957265, "global_step/max_steps": "760/2000", "percentage": "38.00%", "elapsed_time": "7h 27m 40s", "remaining_time": "12h 10m 24s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028295} {"loss": 0.00468215, "grad_norm": 0.55104778, "learning_rate": 7.3e-07, "reward": 2.315625, "reward_std": 0.30975441, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.10656251, "rewards/MazeReward/std": 0.15523205, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.575, "completions/min_length": 39.2, "completions/max_length": 357.4, "completions/clipped_ratio": 0.0, "kl": 0.11705516, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.53846154, "global_step/max_steps": "765/2000", "percentage": "38.25%", "elapsed_time": "7h 30m 31s", "remaining_time": "12h 7m 18s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028301} {"loss": 0.00398538, "grad_norm": 1.21026693, "learning_rate": 7.2e-07, "reward": 2.29375, "reward_std": 0.31227283, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.104375, "rewards/MazeReward/std": 0.15053761, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.95625, "completions/min_length": 46.2, "completions/max_length": 383.8, "completions/clipped_ratio": 0.0, "kl": 0.09963315, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.58119658, "global_step/max_steps": "770/2000", "percentage": "38.50%", "elapsed_time": "7h 33m 22s", "remaining_time": "12h 4m 13s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028306} {"loss": 0.00389398, "grad_norm": 0.78170608, "learning_rate": 7.2e-07, "reward": 2.3921875, "reward_std": 0.52900789, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.114375, "rewards/MazeReward/std": 0.16184994, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 139.1375, "completions/min_length": 44.4, "completions/max_length": 706.8, "completions/clipped_ratio": 0.0015625, "kl": 0.09734828, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.62393162, "global_step/max_steps": "775/2000", "percentage": "38.75%", "elapsed_time": "7h 36m 52s", "remaining_time": "12h 2m 9s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028272} {"loss": 0.00392622, "grad_norm": 0.09525023, "learning_rate": 7.2e-07, "reward": 2.28125, "reward_std": 0.32396026, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.103125, "rewards/MazeReward/std": 0.14976677, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.196875, "completions/min_length": 43.2, "completions/max_length": 370.6, "completions/clipped_ratio": 0.0, "kl": 0.09814363, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.66666667, "global_step/max_steps": "780/2000", "percentage": "39.00%", "elapsed_time": "7h 39m 42s", "remaining_time": "11h 59m 2s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028279} {"loss": 0.00413545, "grad_norm": 0.8633826, "learning_rate": 7.1e-07, "reward": 2.353125, "reward_std": 0.29626257, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.1103125, "rewards/MazeReward/std": 0.14826813, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.6578125, "completions/min_length": 48.4, "completions/max_length": 360.0, "completions/clipped_ratio": 0.0, "kl": 0.10337876, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.70940171, "global_step/max_steps": "785/2000", "percentage": "39.25%", "elapsed_time": "7h 42m 32s", "remaining_time": "11h 55m 54s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028285} {"loss": 0.00465489, "grad_norm": 0.89943613, "learning_rate": 7.1e-07, "reward": 2.284375, "reward_std": 0.15025, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.1034375, "rewards/MazeReward/std": 0.1475174, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.65625, "completions/min_length": 47.2, "completions/max_length": 382.8, "completions/clipped_ratio": 0.0, "kl": 0.1163611, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.75213675, "global_step/max_steps": "790/2000", "percentage": "39.50%", "elapsed_time": "7h 45m 25s", "remaining_time": "11h 52m 52s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028289} {"loss": 0.00489183, "grad_norm": 1.31065592, "learning_rate": 7e-07, "reward": 2.21875, "reward_std": 0.43136625, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.096875, "rewards/MazeReward/std": 0.16106954, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.7375, "completions/min_length": 44.4, "completions/max_length": 348.4, "completions/clipped_ratio": 0.0, "kl": 0.1223057, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.79487179, "global_step/max_steps": "795/2000", "percentage": "39.75%", "elapsed_time": "7h 48m 16s", "remaining_time": "11h 49m 45s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028296} {"loss": 0.00490034, "grad_norm": 1.53343346, "learning_rate": 7e-07, "reward": 2.59375, "reward_std": 0.59866194, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.13437501, "rewards/MazeReward/std": 0.17435938, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.8796875, "completions/min_length": 44.0, "completions/max_length": 291.6, "completions/clipped_ratio": 0.0, "kl": 0.1225133, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.83760684, "global_step/max_steps": "800/2000", "percentage": "40.00%", "elapsed_time": "7h 50m 55s", "remaining_time": "11h 46m 22s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028313} {"loss": 0.00499131, "grad_norm": 0.84350435, "learning_rate": 7e-07, "reward": 2.4609375, "reward_std": 0.32227788, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.12125, "rewards/MazeReward/std": 0.16714947, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.9640625, "completions/min_length": 44.0, "completions/max_length": 297.0, "completions/clipped_ratio": 0.0, "kl": 0.12477835, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.88034188, "global_step/max_steps": "805/2000", "percentage": "40.25%", "elapsed_time": "7h 54m 48s", "remaining_time": "11h 44m 50s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028257} {"loss": 0.00467074, "grad_norm": 1.00457866, "learning_rate": 6.9e-07, "reward": 2.36875, "reward_std": 0.1768634, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.11187501, "rewards/MazeReward/std": 0.1532081, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 109.0140625, "completions/min_length": 47.4, "completions/max_length": 320.2, "completions/clipped_ratio": 0.0, "kl": 0.11675225, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.92307692, "global_step/max_steps": "810/2000", "percentage": "40.50%", "elapsed_time": "7h 57m 32s", "remaining_time": "11h 41m 35s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028269} {"loss": 0.00478288, "grad_norm": 1.28987642, "learning_rate": 6.9e-07, "reward": 2.346875, "reward_std": 0.53534248, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.1096875, "rewards/MazeReward/std": 0.16762269, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.41875, "completions/min_length": 46.2, "completions/max_length": 355.6, "completions/clipped_ratio": 0.0, "kl": 0.11954717, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.96581197, "global_step/max_steps": "815/2000", "percentage": "40.75%", "elapsed_time": "8h 0m 26s", "remaining_time": "11h 38m 32s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028273} {"loss": 0.00467019, "grad_norm": 0.46566543, "learning_rate": 6.9e-07, "reward": 2.28886719, "reward_std": 0.41036257, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.1040625, "rewards/MazeReward/std": 0.16578746, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 116.9609375, "completions/min_length": 43.8, "completions/max_length": 320.8, "completions/clipped_ratio": 0.0, "kl": 0.11674827, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.00854701, "global_step/max_steps": "820/2000", "percentage": "41.00%", "elapsed_time": "8h 3m 13s", "remaining_time": "11h 35m 22s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028282} {"loss": 0.00523757, "grad_norm": 0.91403278, "learning_rate": 6.8e-07, "reward": 2.3, "reward_std": 0.24220315, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.105, "rewards/MazeReward/std": 0.15769463, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 104.7578125, "completions/min_length": 43.6, "completions/max_length": 298.8, "completions/clipped_ratio": 0.0, "kl": 0.1309411, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.05128205, "global_step/max_steps": "825/2000", "percentage": "41.25%", "elapsed_time": "8h 5m 56s", "remaining_time": "11h 32m 5s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028296} {"loss": 0.00504199, "grad_norm": 0.94175006, "learning_rate": 6.8e-07, "reward": 2.396875, "reward_std": 0.32111557, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1146875, "rewards/MazeReward/std": 0.14987296, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 106.153125, "completions/min_length": 44.2, "completions/max_length": 327.4, "completions/clipped_ratio": 0.0, "kl": 0.12604689, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.09401709, "global_step/max_steps": "830/2000", "percentage": "41.50%", "elapsed_time": "8h 8m 42s", "remaining_time": "11h 28m 53s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028306} {"loss": 0.00501505, "grad_norm": 0.52867001, "learning_rate": 6.7e-07, "reward": 2.41875, "reward_std": 0.34288726, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.116875, "rewards/MazeReward/std": 0.17734502, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 108.1421875, "completions/min_length": 42.2, "completions/max_length": 307.4, "completions/clipped_ratio": 0.0, "kl": 0.12536691, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.13675214, "global_step/max_steps": "835/2000", "percentage": "41.75%", "elapsed_time": "8h 11m 26s", "remaining_time": "11h 25m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028318} {"loss": 0.00426045, "grad_norm": 0.74039599, "learning_rate": 6.7e-07, "reward": 2.509375, "reward_std": 0.5740606, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1259375, "rewards/MazeReward/std": 0.16053207, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.98125, "completions/min_length": 45.8, "completions/max_length": 366.6, "completions/clipped_ratio": 0.0, "kl": 0.10649481, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.17948718, "global_step/max_steps": "840/2000", "percentage": "42.00%", "elapsed_time": "8h 14m 19s", "remaining_time": "11h 22m 38s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028322} {"loss": 0.00419768, "grad_norm": 0.88372135, "learning_rate": 6.7e-07, "reward": 2.371875, "reward_std": 0.32826977, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1121875, "rewards/MazeReward/std": 0.15610672, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.059375, "completions/min_length": 42.0, "completions/max_length": 309.0, "completions/clipped_ratio": 0.0, "kl": 0.10494112, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.22222222, "global_step/max_steps": "845/2000", "percentage": "42.25%", "elapsed_time": "8h 17m 4s", "remaining_time": "11h 19m 26s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028332} {"loss": 0.00440416, "grad_norm": 0.94042322, "learning_rate": 6.6e-07, "reward": 2.63261719, "reward_std": 0.63320066, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.1384375, "rewards/MazeReward/std": 0.18257659, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 122.8390625, "completions/min_length": 46.0, "completions/max_length": 378.2, "completions/clipped_ratio": 0.0, "kl": 0.11009908, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.26495726, "global_step/max_steps": "850/2000", "percentage": "42.50%", "elapsed_time": "8h 19m 56s", "remaining_time": "11h 16m 23s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028336} {"loss": 0.00483029, "grad_norm": 0.88428687, "learning_rate": 6.6e-07, "reward": 2.678125, "reward_std": 0.2256697, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.1428125, "rewards/MazeReward/std": 0.15798467, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.6125, "completions/min_length": 46.2, "completions/max_length": 301.6, "completions/clipped_ratio": 0.0, "kl": 0.12074682, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.30769231, "global_step/max_steps": "855/2000", "percentage": "42.75%", "elapsed_time": "8h 22m 39s", "remaining_time": "11h 13m 9s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028349} {"loss": 0.00520564, "grad_norm": 0.12253567, "learning_rate": 6.5e-07, "reward": 2.134375, "reward_std": 0.07553947, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.08843751, "rewards/MazeReward/std": 0.13382319, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 111.078125, "completions/min_length": 44.8, "completions/max_length": 321.6, "completions/clipped_ratio": 0.0, "kl": 0.13014833, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.35042735, "global_step/max_steps": "860/2000", "percentage": "43.00%", "elapsed_time": "8h 25m 25s", "remaining_time": "11h 9m 59s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028359} {"loss": 0.00479661, "grad_norm": 0.98282542, "learning_rate": 6.5e-07, "reward": 2.08125, "reward_std": 0.081854, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.083125, "rewards/MazeReward/std": 0.12961902, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.615625, "completions/min_length": 43.4, "completions/max_length": 369.6, "completions/clipped_ratio": 0.0, "kl": 0.11990116, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.39316239, "global_step/max_steps": "865/2000", "percentage": "43.25%", "elapsed_time": "8h 28m 16s", "remaining_time": "11h 6m 55s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028364} {"loss": 0.00469642, "grad_norm": 1.56415407, "learning_rate": 6.5e-07, "reward": 2.278125, "reward_std": 0.37824142, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.1028125, "rewards/MazeReward/std": 0.16968013, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.7828125, "completions/min_length": 45.8, "completions/max_length": 427.8, "completions/clipped_ratio": 0.0, "kl": 0.11739974, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.43589744, "global_step/max_steps": "870/2000", "percentage": "43.50%", "elapsed_time": "8h 31m 13s", "remaining_time": "11h 4m 0s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028363} {"loss": 0.00419211, "grad_norm": 0.98798689, "learning_rate": 6.4e-07, "reward": 2.690625, "reward_std": 0.39118526, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.1440625, "rewards/MazeReward/std": 0.17793795, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.378125, "completions/min_length": 48.2, "completions/max_length": 407.8, "completions/clipped_ratio": 0.0, "kl": 0.10477359, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.47863248, "global_step/max_steps": "875/2000", "percentage": "43.75%", "elapsed_time": "8h 34m 8s", "remaining_time": "11h 1m 2s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028365} {"loss": 0.00388349, "grad_norm": 1.2740448, "learning_rate": 6.4e-07, "reward": 2.121875, "reward_std": 0.42704968, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.0871875, "rewards/MazeReward/std": 0.1474171, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.2828125, "completions/min_length": 48.0, "completions/max_length": 352.6, "completions/clipped_ratio": 0.0, "kl": 0.09708791, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.52136752, "global_step/max_steps": "880/2000", "percentage": "44.00%", "elapsed_time": "8h 36m 56s", "remaining_time": "10h 57m 55s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028372} {"loss": 0.00437901, "grad_norm": 0.88167445, "learning_rate": 6.3e-07, "reward": 2.521875, "reward_std": 0.4934528, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1271875, "rewards/MazeReward/std": 0.18064942, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.08125, "completions/min_length": 48.2, "completions/max_length": 338.0, "completions/clipped_ratio": 0.0, "kl": 0.10946231, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.56410256, "global_step/max_steps": "885/2000", "percentage": "44.25%", "elapsed_time": "8h 39m 44s", "remaining_time": "10h 54m 48s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02838} {"loss": 0.00435116, "grad_norm": 0.79042468, "learning_rate": 6.3e-07, "reward": 2.3984375, "reward_std": 0.4378116, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.115, "rewards/MazeReward/std": 0.16837832, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.7390625, "completions/min_length": 46.4, "completions/max_length": 357.2, "completions/clipped_ratio": 0.0, "kl": 0.10876898, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.60683761, "global_step/max_steps": "890/2000", "percentage": "44.50%", "elapsed_time": "8h 42m 33s", "remaining_time": "10h 51m 43s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028386} {"loss": 0.00472467, "grad_norm": 0.68316001, "learning_rate": 6.3e-07, "reward": 2.159375, "reward_std": 0.25841794, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.0909375, "rewards/MazeReward/std": 0.15757765, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.16875, "completions/min_length": 42.2, "completions/max_length": 394.4, "completions/clipped_ratio": 0.0, "kl": 0.11811228, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.64957265, "global_step/max_steps": "895/2000", "percentage": "44.75%", "elapsed_time": "8h 45m 27s", "remaining_time": "10h 48m 45s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028388} {"loss": 0.00487825, "grad_norm": 0.9463457, "learning_rate": 6.2e-07, "reward": 2.659375, "reward_std": 0.50902855, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1409375, "rewards/MazeReward/std": 0.18577449, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.315625, "completions/min_length": 43.4, "completions/max_length": 392.4, "completions/clipped_ratio": 0.0, "kl": 0.12197577, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.69230769, "global_step/max_steps": "900/2000", "percentage": "45.00%", "elapsed_time": "8h 48m 21s", "remaining_time": "10h 45m 46s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028389} {"loss": 0.0047689, "grad_norm": 0.9935293, "learning_rate": 6.2e-07, "reward": 2.33710937, "reward_std": 0.40407248, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.10875001, "rewards/MazeReward/std": 0.17144443, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.24960937, "rewards/Format/std": 0.00441942, "completions/mean_length": 124.259375, "completions/min_length": 47.0, "completions/max_length": 310.4, "completions/clipped_ratio": 0.0, "kl": 0.11919824, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.73504274, "global_step/max_steps": "905/2000", "percentage": "45.25%", "elapsed_time": "8h 52m 13s", "remaining_time": "10h 43m 58s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02834} {"loss": 0.00427965, "grad_norm": 0.42737861, "learning_rate": 6.1e-07, "reward": 2.640625, "reward_std": 0.54690897, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.1390625, "rewards/MazeReward/std": 0.17976904, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.9703125, "completions/min_length": 48.8, "completions/max_length": 401.8, "completions/clipped_ratio": 0.0, "kl": 0.10699953, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.77777778, "global_step/max_steps": "910/2000", "percentage": "45.50%", "elapsed_time": "8h 55m 6s", "remaining_time": "10h 40m 57s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028343} {"loss": 0.00394656, "grad_norm": 1.11436477, "learning_rate": 6.1e-07, "reward": 2.365625, "reward_std": 0.53135021, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.1115625, "rewards/MazeReward/std": 0.17100095, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.7734375, "completions/min_length": 47.8, "completions/max_length": 294.8, "completions/clipped_ratio": 0.0, "kl": 0.09865479, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.82051282, "global_step/max_steps": "915/2000", "percentage": "45.75%", "elapsed_time": "8h 57m 50s", "remaining_time": "10h 37m 45s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028354} {"loss": 0.00383287, "grad_norm": 1.1322236, "learning_rate": 6.1e-07, "reward": 2.29375, "reward_std": 0.39656127, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.104375, "rewards/MazeReward/std": 0.15971106, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 137.840625, "completions/min_length": 49.6, "completions/max_length": 340.0, "completions/clipped_ratio": 0.0, "kl": 0.09581782, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.86324786, "global_step/max_steps": "920/2000", "percentage": "46.00%", "elapsed_time": "9h 0m 36s", "remaining_time": "10h 34m 37s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028363} {"loss": 0.00370415, "grad_norm": 0.90331913, "learning_rate": 6e-07, "reward": 2.58574219, "reward_std": 0.52041182, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.13375, "rewards/MazeReward/std": 0.17141991, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 137.2859375, "completions/min_length": 51.6, "completions/max_length": 350.4, "completions/clipped_ratio": 0.0, "kl": 0.09261847, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.90598291, "global_step/max_steps": "925/2000", "percentage": "46.25%", "elapsed_time": "9h 3m 25s", "remaining_time": "10h 31m 33s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028369} {"loss": 0.00425707, "grad_norm": 1.10268411, "learning_rate": 6e-07, "reward": 2.884375, "reward_std": 0.628828, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.16343751, "rewards/MazeReward/std": 0.18413116, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.3171875, "completions/min_length": 46.4, "completions/max_length": 341.0, "completions/clipped_ratio": 0.0, "kl": 0.10645872, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.94871795, "global_step/max_steps": "930/2000", "percentage": "46.50%", "elapsed_time": "9h 6m 13s", "remaining_time": "10h 28m 27s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028377} {"loss": 0.00425869, "grad_norm": 1.03199875, "learning_rate": 5.9e-07, "reward": 2.790625, "reward_std": 0.34971794, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.1540625, "rewards/MazeReward/std": 0.18062106, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.8984375, "completions/min_length": 45.8, "completions/max_length": 370.6, "completions/clipped_ratio": 0.0, "kl": 0.1064688, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.99145299, "global_step/max_steps": "935/2000", "percentage": "46.75%", "elapsed_time": "9h 9m 5s", "remaining_time": "10h 25m 26s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02838} {"loss": 0.00418409, "grad_norm": 1.03741925, "learning_rate": 5.9e-07, "reward": 2.4390625, "reward_std": 0.53301597, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.1190625, "rewards/MazeReward/std": 0.17639966, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.0609375, "completions/min_length": 44.8, "completions/max_length": 452.6, "completions/clipped_ratio": 0.0, "kl": 0.10459753, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.03418803, "global_step/max_steps": "940/2000", "percentage": "47.00%", "elapsed_time": "9h 12m 7s", "remaining_time": "10h 22m 36s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028375} {"loss": 0.0043811, "grad_norm": 1.09149151, "learning_rate": 5.9e-07, "reward": 2.49824219, "reward_std": 0.39825394, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.125, "rewards/MazeReward/std": 0.17814156, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 133.3921875, "completions/min_length": 52.2, "completions/max_length": 391.6, "completions/clipped_ratio": 0.0, "kl": 0.1095439, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.07692308, "global_step/max_steps": "945/2000", "percentage": "47.25%", "elapsed_time": "9h 15m 1s", "remaining_time": "10h 19m 38s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028377} {"loss": 0.00411823, "grad_norm": 0.99906118, "learning_rate": 5.8e-07, "reward": 2.390625, "reward_std": 0.49241425, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.1140625, "rewards/MazeReward/std": 0.16960946, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.075, "completions/min_length": 50.8, "completions/max_length": 370.0, "completions/clipped_ratio": 0.0, "kl": 0.10294656, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.11965812, "global_step/max_steps": "950/2000", "percentage": "47.50%", "elapsed_time": "9h 17m 54s", "remaining_time": "10h 16m 37s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02838} {"loss": 0.00456087, "grad_norm": 0.50134222, "learning_rate": 5.8e-07, "reward": 2.165625, "reward_std": 0.32722049, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.0915625, "rewards/MazeReward/std": 0.15920471, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.9890625, "completions/min_length": 45.4, "completions/max_length": 392.2, "completions/clipped_ratio": 0.0, "kl": 0.11401241, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.16239316, "global_step/max_steps": "955/2000", "percentage": "47.75%", "elapsed_time": "9h 20m 47s", "remaining_time": "10h 13m 38s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028382} {"loss": 0.00417058, "grad_norm": 0.90856888, "learning_rate": 5.7e-07, "reward": 2.45, "reward_std": 0.31109243, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1203125, "rewards/MazeReward/std": 0.17526073, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.02490138, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 139.0234375, "completions/min_length": 48.4, "completions/max_length": 785.4, "completions/clipped_ratio": 0.003125, "kl": 0.10424432, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.20512821, "global_step/max_steps": "960/2000", "percentage": "48.00%", "elapsed_time": "9h 24m 28s", "remaining_time": "10h 11m 31s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028345} {"loss": 0.00458899, "grad_norm": 0.42878494, "learning_rate": 5.7e-07, "reward": 2.875, "reward_std": 0.42599531, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.1625, "rewards/MazeReward/std": 0.19694378, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.1703125, "completions/min_length": 45.6, "completions/max_length": 381.8, "completions/clipped_ratio": 0.0, "kl": 0.11473486, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.24786325, "global_step/max_steps": "965/2000", "percentage": "48.25%", "elapsed_time": "9h 27m 21s", "remaining_time": "10h 8m 30s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028348} {"loss": 0.00443957, "grad_norm": 0.46980088, "learning_rate": 5.7e-07, "reward": 2.678125, "reward_std": 0.3561488, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.143125, "rewards/MazeReward/std": 0.17075004, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535534, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.865625, "completions/min_length": 44.8, "completions/max_length": 1040.4, "completions/clipped_ratio": 0.003125, "kl": 0.11099314, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.29059829, "global_step/max_steps": "970/2000", "percentage": "48.50%", "elapsed_time": "9h 31m 30s", "remaining_time": "10h 6m 51s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028288} {"loss": 0.0043874, "grad_norm": 1.12468144, "learning_rate": 5.6e-07, "reward": 2.5203125, "reward_std": 0.47587602, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1271875, "rewards/MazeReward/std": 0.17132539, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.553125, "completions/min_length": 44.4, "completions/max_length": 679.6, "completions/clipped_ratio": 0.0015625, "kl": 0.10966736, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.33333333, "global_step/max_steps": "975/2000", "percentage": "48.75%", "elapsed_time": "9h 34m 55s", "remaining_time": "10h 4m 24s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028265} {"loss": 0.00482525, "grad_norm": 0.93081154, "learning_rate": 5.6e-07, "reward": 2.5078125, "reward_std": 0.48774629, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1259375, "rewards/MazeReward/std": 0.17260953, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.0546875, "completions/min_length": 45.0, "completions/max_length": 643.6, "completions/clipped_ratio": 0.0015625, "kl": 0.12060629, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.37606838, "global_step/max_steps": "980/2000", "percentage": "49.00%", "elapsed_time": "9h 38m 20s", "remaining_time": "10h 1m 56s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028242} {"loss": 0.00509611, "grad_norm": 0.45122354, "learning_rate": 5.5e-07, "reward": 2.70761719, "reward_std": 0.27505553, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.1459375, "rewards/MazeReward/std": 0.17079398, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 103.9421875, "completions/min_length": 40.8, "completions/max_length": 306.0, "completions/clipped_ratio": 0.0, "kl": 0.12739774, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.41880342, "global_step/max_steps": "985/2000", "percentage": "49.25%", "elapsed_time": "9h 41m 2s", "remaining_time": "9h 58m 44s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028254} {"loss": 0.00535293, "grad_norm": 0.51765524, "learning_rate": 5.5e-07, "reward": 2.628125, "reward_std": 0.23545527, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.1378125, "rewards/MazeReward/std": 0.18680999, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 108.8203125, "completions/min_length": 44.6, "completions/max_length": 326.4, "completions/clipped_ratio": 0.0, "kl": 0.13383744, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.46153846, "global_step/max_steps": "990/2000", "percentage": "49.50%", "elapsed_time": "9h 43m 46s", "remaining_time": "9h 55m 33s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028265} {"loss": 0.00510081, "grad_norm": 0.70400392, "learning_rate": 5.5e-07, "reward": 2.84375, "reward_std": 0.34772194, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.159375, "rewards/MazeReward/std": 0.20540504, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.4109375, "completions/min_length": 46.4, "completions/max_length": 337.4, "completions/clipped_ratio": 0.0, "kl": 0.1275129, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.5042735, "global_step/max_steps": "995/2000", "percentage": "49.75%", "elapsed_time": "9h 46m 34s", "remaining_time": "9h 52m 28s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028272} {"loss": 0.00459881, "grad_norm": 0.48559915, "learning_rate": 5.4e-07, "reward": 2.396875, "reward_std": 0.29418938, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.1146875, "rewards/MazeReward/std": 0.15751169, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.3171875, "completions/min_length": 44.6, "completions/max_length": 402.8, "completions/clipped_ratio": 0.0, "kl": 0.11497611, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.54700855, "global_step/max_steps": "1000/2000", "percentage": "50.00%", "elapsed_time": "9h 49m 29s", "remaining_time": "9h 49m 29s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028273} {"loss": 0.00456762, "grad_norm": 1.19498497, "learning_rate": 5.4e-07, "reward": 2.446875, "reward_std": 0.50553777, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1196875, "rewards/MazeReward/std": 0.17761976, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.1109375, "completions/min_length": 48.8, "completions/max_length": 381.6, "completions/clipped_ratio": 0.0, "kl": 0.11416346, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.58974359, "global_step/max_steps": "1005/2000", "percentage": "50.25%", "elapsed_time": "9h 53m 31s", "remaining_time": "9h 47m 37s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028221} {"loss": 0.00425075, "grad_norm": 0.82302264, "learning_rate": 5.3e-07, "reward": 2.765625, "reward_std": 0.51984248, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1515625, "rewards/MazeReward/std": 0.19108597, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 135.553125, "completions/min_length": 49.6, "completions/max_length": 384.0, "completions/clipped_ratio": 0.0, "kl": 0.10625858, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.63247863, "global_step/max_steps": "1010/2000", "percentage": "50.50%", "elapsed_time": "9h 56m 20s", "remaining_time": "9h 44m 32s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028227} {"loss": 0.00396508, "grad_norm": 0.68345639, "learning_rate": 5.3e-07, "reward": 2.821875, "reward_std": 0.46735044, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.15718751, "rewards/MazeReward/std": 0.19530396, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.9265625, "completions/min_length": 48.0, "completions/max_length": 403.0, "completions/clipped_ratio": 0.0, "kl": 0.09912158, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.67521368, "global_step/max_steps": "1015/2000", "percentage": "50.75%", "elapsed_time": "9h 59m 17s", "remaining_time": "9h 41m 34s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028228} {"loss": 0.00416225, "grad_norm": 0.75552652, "learning_rate": 5.2e-07, "reward": 2.9375, "reward_std": 0.24535828, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.16875, "rewards/MazeReward/std": 0.1709858, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.9421875, "completions/min_length": 47.4, "completions/max_length": 369.2, "completions/clipped_ratio": 0.0, "kl": 0.10403794, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.71794872, "global_step/max_steps": "1020/2000", "percentage": "51.00%", "elapsed_time": "10h 2m 6s", "remaining_time": "9h 38m 30s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028234} {"loss": 0.00408639, "grad_norm": 0.52372482, "learning_rate": 5.2e-07, "reward": 2.653125, "reward_std": 0.39622445, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.14031251, "rewards/MazeReward/std": 0.18336016, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 135.3453125, "completions/min_length": 48.2, "completions/max_length": 445.0, "completions/clipped_ratio": 0.0, "kl": 0.10214885, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.76068376, "global_step/max_steps": "1025/2000", "percentage": "51.25%", "elapsed_time": "10h 5m 4s", "remaining_time": "9h 35m 33s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028233} {"loss": 0.00364055, "grad_norm": 0.77064145, "learning_rate": 5.2e-07, "reward": 2.846875, "reward_std": 0.54511474, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1596875, "rewards/MazeReward/std": 0.19166248, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 144.084375, "completions/min_length": 46.2, "completions/max_length": 376.0, "completions/clipped_ratio": 0.0, "kl": 0.09099416, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.8034188, "global_step/max_steps": "1030/2000", "percentage": "51.50%", "elapsed_time": "10h 7m 54s", "remaining_time": "9h 32m 30s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028239} {"loss": 0.00368196, "grad_norm": 0.72612577, "learning_rate": 5.1e-07, "reward": 2.78125, "reward_std": 0.21841665, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.15312501, "rewards/MazeReward/std": 0.19261255, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 151.68125, "completions/min_length": 53.8, "completions/max_length": 493.6, "completions/clipped_ratio": 0.0, "kl": 0.09205409, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.84615385, "global_step/max_steps": "1035/2000", "percentage": "51.75%", "elapsed_time": "10h 10m 59s", "remaining_time": "9h 29m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028233} {"loss": 0.0038292, "grad_norm": 1.2739756, "learning_rate": 5.1e-07, "reward": 2.66230469, "reward_std": 0.6662475, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.14125, "rewards/MazeReward/std": 0.21111677, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 142.8265625, "completions/min_length": 50.0, "completions/max_length": 362.6, "completions/clipped_ratio": 0.0, "kl": 0.09573276, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.88888889, "global_step/max_steps": "1040/2000", "percentage": "52.00%", "elapsed_time": "10h 13m 48s", "remaining_time": "9h 26m 35s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028239} {"loss": 0.00352369, "grad_norm": 1.02050231, "learning_rate": 5e-07, "reward": 2.62949219, "reward_std": 0.86726315, "frac_reward_zero_std": 0.7, "rewards/MazeReward/mean": 0.138125, "rewards/MazeReward/std": 0.21768845, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 154.9078125, "completions/min_length": 54.6, "completions/max_length": 406.6, "completions/clipped_ratio": 0.0, "kl": 0.08808447, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.93162393, "global_step/max_steps": "1045/2000", "percentage": "52.25%", "elapsed_time": "10h 16m 44s", "remaining_time": "9h 23m 37s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02824} {"loss": 0.00317292, "grad_norm": 0.80921657, "learning_rate": 5e-07, "reward": 2.503125, "reward_std": 0.53585547, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.1253125, "rewards/MazeReward/std": 0.1783435, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 153.0296875, "completions/min_length": 52.8, "completions/max_length": 379.2, "completions/clipped_ratio": 0.0, "kl": 0.07932715, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.97435897, "global_step/max_steps": "1050/2000", "percentage": "52.50%", "elapsed_time": "10h 19m 36s", "remaining_time": "9h 20m 35s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028244} {"loss": 0.00339748, "grad_norm": 1.35868786, "learning_rate": 5e-07, "reward": 2.8109375, "reward_std": 0.67715482, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.15625, "rewards/MazeReward/std": 0.19654485, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 152.5109375, "completions/min_length": 55.8, "completions/max_length": 421.0, "completions/clipped_ratio": 0.0, "kl": 0.08491902, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.01709402, "global_step/max_steps": "1055/2000", "percentage": "52.75%", "elapsed_time": "10h 22m 33s", "remaining_time": "9h 17m 38s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028244} {"loss": 0.00364397, "grad_norm": 0.3424304, "learning_rate": 4.9e-07, "reward": 2.803125, "reward_std": 0.67133673, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.1553125, "rewards/MazeReward/std": 0.20383458, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 146.475, "completions/min_length": 52.0, "completions/max_length": 389.2, "completions/clipped_ratio": 0.0, "kl": 0.09108644, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.05982906, "global_step/max_steps": "1060/2000", "percentage": "53.00%", "elapsed_time": "10h 25m 26s", "remaining_time": "9h 14m 38s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028247} {"loss": 0.00418588, "grad_norm": 0.87512936, "learning_rate": 4.9e-07, "reward": 3.0625, "reward_std": 0.48923705, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.18125001, "rewards/MazeReward/std": 0.21844839, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 139.878125, "completions/min_length": 49.8, "completions/max_length": 390.2, "completions/clipped_ratio": 0.0, "kl": 0.10464337, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.1025641, "global_step/max_steps": "1065/2000", "percentage": "53.25%", "elapsed_time": "10h 28m 19s", "remaining_time": "9h 11m 37s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02825} {"loss": 0.00428722, "grad_norm": 0.42757762, "learning_rate": 4.8e-07, "reward": 2.64375, "reward_std": 0.37823322, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.139375, "rewards/MazeReward/std": 0.16454499, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.3484375, "completions/min_length": 46.2, "completions/max_length": 419.4, "completions/clipped_ratio": 0.0, "kl": 0.10715237, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.14529915, "global_step/max_steps": "1070/2000", "percentage": "53.50%", "elapsed_time": "10h 31m 15s", "remaining_time": "9h 8m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028251} {"loss": 0.00449047, "grad_norm": 1.23474677, "learning_rate": 4.8e-07, "reward": 2.990625, "reward_std": 0.67840211, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.17406251, "rewards/MazeReward/std": 0.21663503, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.9015625, "completions/min_length": 48.8, "completions/max_length": 370.8, "completions/clipped_ratio": 0.0, "kl": 0.11225857, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.18803419, "global_step/max_steps": "1075/2000", "percentage": "53.75%", "elapsed_time": "10h 34m 4s", "remaining_time": "9h 5m 36s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028256} {"loss": 0.00434243, "grad_norm": 0.84690898, "learning_rate": 4.8e-07, "reward": 3.11875, "reward_std": 0.55710164, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.18687501, "rewards/MazeReward/std": 0.22037322, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.5484375, "completions/min_length": 53.2, "completions/max_length": 291.4, "completions/clipped_ratio": 0.0, "kl": 0.10855783, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.23076923, "global_step/max_steps": "1080/2000", "percentage": "54.00%", "elapsed_time": "10h 36m 49s", "remaining_time": "9h 2m 28s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028266} {"loss": 0.00377186, "grad_norm": 0.76044872, "learning_rate": 4.7e-07, "reward": 2.740625, "reward_std": 0.49681556, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.1490625, "rewards/MazeReward/std": 0.19011185, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.4609375, "completions/min_length": 47.6, "completions/max_length": 329.8, "completions/clipped_ratio": 0.0, "kl": 0.09429952, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.27350427, "global_step/max_steps": "1085/2000", "percentage": "54.25%", "elapsed_time": "10h 39m 36s", "remaining_time": "8h 59m 23s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028272} {"loss": 0.00437518, "grad_norm": 0.69654668, "learning_rate": 4.7e-07, "reward": 2.921875, "reward_std": 0.46210751, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.1671875, "rewards/MazeReward/std": 0.19805112, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.4609375, "completions/min_length": 53.2, "completions/max_length": 318.6, "completions/clipped_ratio": 0.0, "kl": 0.10939432, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.31623932, "global_step/max_steps": "1090/2000", "percentage": "54.50%", "elapsed_time": "10h 42m 24s", "remaining_time": "8h 56m 19s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028279} {"loss": 0.00398253, "grad_norm": 0.71592992, "learning_rate": 4.6e-07, "reward": 2.72636719, "reward_std": 0.52156622, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.1478125, "rewards/MazeReward/std": 0.1959883, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 129.3796875, "completions/min_length": 47.2, "completions/max_length": 672.8, "completions/clipped_ratio": 0.0, "kl": 0.09956523, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.35897436, "global_step/max_steps": "1095/2000", "percentage": "54.75%", "elapsed_time": "10h 45m 50s", "remaining_time": "8h 53m 46s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028258} {"loss": 0.00435564, "grad_norm": 0.81527898, "learning_rate": 4.6e-07, "reward": 2.590625, "reward_std": 0.31248245, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1340625, "rewards/MazeReward/std": 0.17623247, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.275, "completions/min_length": 51.4, "completions/max_length": 349.4, "completions/clipped_ratio": 0.0, "kl": 0.10890262, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.4017094, "global_step/max_steps": "1100/2000", "percentage": "55.00%", "elapsed_time": "10h 48m 37s", "remaining_time": "8h 50m 41s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028265} {"loss": 0.00435421, "grad_norm": 1.01459424, "learning_rate": 4.5e-07, "reward": 2.721875, "reward_std": 0.26375698, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.1471875, "rewards/MazeReward/std": 0.19373294, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.2421875, "completions/min_length": 47.4, "completions/max_length": 332.2, "completions/clipped_ratio": 0.0, "kl": 0.10884021, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.44444444, "global_step/max_steps": "1105/2000", "percentage": "55.25%", "elapsed_time": "10h 52m 29s", "remaining_time": "8h 48m 29s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028225} {"loss": 0.0043663, "grad_norm": 1.1849331, "learning_rate": 4.5e-07, "reward": 2.540625, "reward_std": 0.52928144, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.1290625, "rewards/MazeReward/std": 0.20077509, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.5453125, "completions/min_length": 50.0, "completions/max_length": 339.6, "completions/clipped_ratio": 0.0, "kl": 0.10914052, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.48717949, "global_step/max_steps": "1110/2000", "percentage": "55.50%", "elapsed_time": "10h 55m 18s", "remaining_time": "8h 45m 25s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028231} {"loss": 0.00406612, "grad_norm": 1.28612507, "learning_rate": 4.5e-07, "reward": 2.771875, "reward_std": 0.45678749, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1521875, "rewards/MazeReward/std": 0.18613164, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 139.1921875, "completions/min_length": 45.0, "completions/max_length": 391.0, "completions/clipped_ratio": 0.0, "kl": 0.10163589, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.52991453, "global_step/max_steps": "1115/2000", "percentage": "55.75%", "elapsed_time": "10h 58m 10s", "remaining_time": "8h 42m 24s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028235} {"loss": 0.00421871, "grad_norm": 1.50165899, "learning_rate": 4.4e-07, "reward": 2.7125, "reward_std": 0.53669436, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.14625, "rewards/MazeReward/std": 0.21170917, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.6546875, "completions/min_length": 50.8, "completions/max_length": 369.6, "completions/clipped_ratio": 0.0, "kl": 0.10546655, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.57264957, "global_step/max_steps": "1120/2000", "percentage": "56.00%", "elapsed_time": "11h 1m 2s", "remaining_time": "8h 39m 23s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028239} {"loss": 0.00382935, "grad_norm": 1.17988972, "learning_rate": 4.4e-07, "reward": 2.8375, "reward_std": 0.78995099, "frac_reward_zero_std": 0.6875, "rewards/MazeReward/mean": 0.15875, "rewards/MazeReward/std": 0.20503087, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 138.1640625, "completions/min_length": 50.4, "completions/max_length": 336.8, "completions/clipped_ratio": 0.0, "kl": 0.09572658, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.61538462, "global_step/max_steps": "1125/2000", "percentage": "56.25%", "elapsed_time": "11h 3m 48s", "remaining_time": "8h 36m 17s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028246} {"loss": 0.00390529, "grad_norm": 0.6998975, "learning_rate": 4.3e-07, "reward": 2.571875, "reward_std": 0.4550539, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.1321875, "rewards/MazeReward/std": 0.17721938, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.64375, "completions/min_length": 48.4, "completions/max_length": 363.8, "completions/clipped_ratio": 0.0, "kl": 0.09763574, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.65811966, "global_step/max_steps": "1130/2000", "percentage": "56.50%", "elapsed_time": "11h 6m 36s", "remaining_time": "8h 33m 14s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028252} {"loss": 0.00390236, "grad_norm": 0.5155983, "learning_rate": 4.3e-07, "reward": 2.928125, "reward_std": 0.55520095, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.1678125, "rewards/MazeReward/std": 0.20430524, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 135.034375, "completions/min_length": 51.2, "completions/max_length": 402.2, "completions/clipped_ratio": 0.0, "kl": 0.09754181, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.7008547, "global_step/max_steps": "1135/2000", "percentage": "56.75%", "elapsed_time": "11h 9m 30s", "remaining_time": "8h 30m 14s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028255} {"loss": 0.0045264, "grad_norm": 0.72267469, "learning_rate": 4.3e-07, "reward": 2.86875, "reward_std": 0.55278624, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.161875, "rewards/MazeReward/std": 0.21297796, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.05625, "completions/min_length": 52.4, "completions/max_length": 314.2, "completions/clipped_ratio": 0.0, "kl": 0.11315394, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.74358974, "global_step/max_steps": "1140/2000", "percentage": "57.00%", "elapsed_time": "11h 12m 13s", "remaining_time": "8h 27m 7s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028264} {"loss": 0.00426021, "grad_norm": 1.33995709, "learning_rate": 4.2e-07, "reward": 3.18886719, "reward_std": 0.47525118, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.1940625, "rewards/MazeReward/std": 0.21473246, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 128.296875, "completions/min_length": 49.4, "completions/max_length": 388.6, "completions/clipped_ratio": 0.0, "kl": 0.10648047, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.78632479, "global_step/max_steps": "1145/2000", "percentage": "57.25%", "elapsed_time": "11h 15m 8s", "remaining_time": "8h 24m 8s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028266} {"loss": 0.00418734, "grad_norm": 0.09503596, "learning_rate": 4.2e-07, "reward": 2.778125, "reward_std": 0.43286782, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.1528125, "rewards/MazeReward/std": 0.1956601, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.440625, "completions/min_length": 49.0, "completions/max_length": 400.6, "completions/clipped_ratio": 0.0, "kl": 0.10468062, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.82905983, "global_step/max_steps": "1150/2000", "percentage": "57.50%", "elapsed_time": "11h 18m 1s", "remaining_time": "8h 21m 8s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028269} {"loss": 0.00420615, "grad_norm": 0.7235398, "learning_rate": 4.1e-07, "reward": 2.775, "reward_std": 0.40023253, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.15250001, "rewards/MazeReward/std": 0.20179781, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.01875, "completions/min_length": 48.6, "completions/max_length": 412.2, "completions/clipped_ratio": 0.0, "kl": 0.10513887, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.87179487, "global_step/max_steps": "1155/2000", "percentage": "57.75%", "elapsed_time": "11h 20m 56s", "remaining_time": "8h 18m 10s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02827} {"loss": 0.00381863, "grad_norm": 0.37379541, "learning_rate": 4.1e-07, "reward": 2.584375, "reward_std": 0.29082662, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.1334375, "rewards/MazeReward/std": 0.19001711, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.5703125, "completions/min_length": 50.4, "completions/max_length": 358.4, "completions/clipped_ratio": 0.0, "kl": 0.09545613, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.91452991, "global_step/max_steps": "1160/2000", "percentage": "58.00%", "elapsed_time": "11h 23m 43s", "remaining_time": "8h 15m 6s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028277} {"loss": 0.00398742, "grad_norm": 0.06752439, "learning_rate": 4.1e-07, "reward": 2.4984375, "reward_std": 0.26228816, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.125, "rewards/MazeReward/std": 0.16459158, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.8875, "completions/min_length": 49.8, "completions/max_length": 522.4, "completions/clipped_ratio": 0.0, "kl": 0.09968637, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.95726496, "global_step/max_steps": "1165/2000", "percentage": "58.25%", "elapsed_time": "11h 26m 52s", "remaining_time": "8h 12m 18s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028268} {"loss": 0.00398657, "grad_norm": 0.87592156, "learning_rate": 4e-07, "reward": 2.490625, "reward_std": 0.46472559, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.1240625, "rewards/MazeReward/std": 0.1857353, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.1, "completions/min_length": 51.6, "completions/max_length": 415.6, "completions/clipped_ratio": 0.0, "kl": 0.09964474, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.0, "global_step/max_steps": "1170/2000", "percentage": "58.50%", "elapsed_time": "11h 29m 49s", "remaining_time": "8h 9m 22s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028268} {"loss": 0.00423451, "grad_norm": 0.9439809, "learning_rate": 4e-07, "reward": 3.0, "reward_std": 0.61874294, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.17500001, "rewards/MazeReward/std": 0.22677643, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.7765625, "completions/min_length": 49.4, "completions/max_length": 374.0, "completions/clipped_ratio": 0.0, "kl": 0.10586871, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.04273504, "global_step/max_steps": "1175/2000", "percentage": "58.75%", "elapsed_time": "11h 32m 45s", "remaining_time": "8h 6m 24s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028269} {"loss": 0.00430933, "grad_norm": 0.52499202, "learning_rate": 3.9e-07, "reward": 3.1625, "reward_std": 0.5555284, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.19125, "rewards/MazeReward/std": 0.20346351, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.2671875, "completions/min_length": 49.0, "completions/max_length": 312.0, "completions/clipped_ratio": 0.0, "kl": 0.10773047, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.08547009, "global_step/max_steps": "1180/2000", "percentage": "59.00%", "elapsed_time": "11h 35m 30s", "remaining_time": "8h 3m 19s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028277} {"loss": 0.00448348, "grad_norm": 0.88553374, "learning_rate": 3.9e-07, "reward": 2.740625, "reward_std": 0.71385045, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.1490625, "rewards/MazeReward/std": 0.22811269, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.9328125, "completions/min_length": 48.8, "completions/max_length": 327.0, "completions/clipped_ratio": 0.0, "kl": 0.11209778, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.12820513, "global_step/max_steps": "1185/2000", "percentage": "59.25%", "elapsed_time": "11h 38m 15s", "remaining_time": "8h 0m 14s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028285} {"loss": 0.00451213, "grad_norm": 0.93216021, "learning_rate": 3.9e-07, "reward": 2.646875, "reward_std": 0.42052239, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1396875, "rewards/MazeReward/std": 0.19612097, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.625, "completions/min_length": 45.4, "completions/max_length": 319.2, "completions/clipped_ratio": 0.0, "kl": 0.11280471, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.17094017, "global_step/max_steps": "1190/2000", "percentage": "59.50%", "elapsed_time": "11h 41m 0s", "remaining_time": "7h 57m 9s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028293} {"loss": 0.0043687, "grad_norm": 0.51026986, "learning_rate": 3.8e-07, "reward": 3.01875, "reward_std": 0.43965511, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.176875, "rewards/MazeReward/std": 0.18922464, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.7015625, "completions/min_length": 46.6, "completions/max_length": 342.0, "completions/clipped_ratio": 0.0, "kl": 0.10920268, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.21367521, "global_step/max_steps": "1195/2000", "percentage": "59.75%", "elapsed_time": "11h 43m 47s", "remaining_time": "7h 54m 6s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028299} {"loss": 0.00459345, "grad_norm": 1.14957687, "learning_rate": 3.8e-07, "reward": 2.9375, "reward_std": 0.51868417, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.16875, "rewards/MazeReward/std": 0.21820463, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.0359375, "completions/min_length": 46.8, "completions/max_length": 362.8, "completions/clipped_ratio": 0.0, "kl": 0.1148172, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.25641026, "global_step/max_steps": "1200/2000", "percentage": "60.00%", "elapsed_time": "11h 46m 40s", "remaining_time": "7h 51m 6s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028302} {"loss": 0.00438366, "grad_norm": 0.63934289, "learning_rate": 3.7e-07, "reward": 2.653125, "reward_std": 0.38886548, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1403125, "rewards/MazeReward/std": 0.18066074, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.940625, "completions/min_length": 48.4, "completions/max_length": 345.6, "completions/clipped_ratio": 0.0, "kl": 0.10959113, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.2991453, "global_step/max_steps": "1205/2000", "percentage": "60.25%", "elapsed_time": "11h 50m 36s", "remaining_time": "7h 48m 49s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028262} {"loss": 0.00442606, "grad_norm": 0.65635445, "learning_rate": 3.7e-07, "reward": 2.778125, "reward_std": 0.49973741, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.1528125, "rewards/MazeReward/std": 0.18737705, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.88125, "completions/min_length": 48.2, "completions/max_length": 313.4, "completions/clipped_ratio": 0.0, "kl": 0.11064184, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.34188034, "global_step/max_steps": "1210/2000", "percentage": "60.50%", "elapsed_time": "11h 53m 21s", "remaining_time": "7h 45m 45s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02827} {"loss": 0.00487439, "grad_norm": 0.60227811, "learning_rate": 3.7e-07, "reward": 2.503125, "reward_std": 0.59016268, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1253125, "rewards/MazeReward/std": 0.19279995, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.975, "completions/min_length": 47.6, "completions/max_length": 364.8, "completions/clipped_ratio": 0.0, "kl": 0.12186299, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.38461538, "global_step/max_steps": "1215/2000", "percentage": "60.75%", "elapsed_time": "11h 56m 11s", "remaining_time": "7h 42m 43s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028275} {"loss": 0.00488208, "grad_norm": 1.07736504, "learning_rate": 3.6e-07, "reward": 2.84375, "reward_std": 0.32815314, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.159375, "rewards/MazeReward/std": 0.19335278, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.75, "completions/min_length": 49.0, "completions/max_length": 355.2, "completions/clipped_ratio": 0.0, "kl": 0.12203081, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.42735043, "global_step/max_steps": "1220/2000", "percentage": "61.00%", "elapsed_time": "11h 59m 5s", "remaining_time": "7h 39m 44s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028276} {"loss": 0.00489514, "grad_norm": 0.45989036, "learning_rate": 3.6e-07, "reward": 2.425, "reward_std": 0.30953382, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.1175, "rewards/MazeReward/std": 0.17827851, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.4015625, "completions/min_length": 49.4, "completions/max_length": 360.4, "completions/clipped_ratio": 0.0, "kl": 0.12238693, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.47008547, "global_step/max_steps": "1225/2000", "percentage": "61.25%", "elapsed_time": "12h 1m 54s", "remaining_time": "7h 36m 43s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028281} {"loss": 0.00421106, "grad_norm": 0.47652187, "learning_rate": 3.5e-07, "reward": 2.9625, "reward_std": 0.44157419, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.17125, "rewards/MazeReward/std": 0.20729262, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.1125, "completions/min_length": 47.8, "completions/max_length": 329.0, "completions/clipped_ratio": 0.0, "kl": 0.10526276, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.51282051, "global_step/max_steps": "1230/2000", "percentage": "61.50%", "elapsed_time": "12h 4m 40s", "remaining_time": "7h 33m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028288} {"loss": 0.00412562, "grad_norm": 0.06937887, "learning_rate": 3.5e-07, "reward": 2.4125, "reward_std": 0.35109097, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.11625, "rewards/MazeReward/std": 0.17364375, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.059375, "completions/min_length": 46.0, "completions/max_length": 376.4, "completions/clipped_ratio": 0.0, "kl": 0.10313205, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.55555556, "global_step/max_steps": "1235/2000", "percentage": "61.75%", "elapsed_time": "12h 7m 31s", "remaining_time": "7h 30m 38s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028293} {"loss": 0.00440944, "grad_norm": 1.15094124, "learning_rate": 3.5e-07, "reward": 3.0859375, "reward_std": 0.53763864, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.18375, "rewards/MazeReward/std": 0.20490943, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.6140625, "completions/min_length": 48.6, "completions/max_length": 664.0, "completions/clipped_ratio": 0.0, "kl": 0.11023346, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.5982906, "global_step/max_steps": "1240/2000", "percentage": "62.00%", "elapsed_time": "12h 10m 54s", "remaining_time": "7h 27m 58s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028275} {"loss": 0.0040922, "grad_norm": 0.59066127, "learning_rate": 3.4e-07, "reward": 2.975, "reward_std": 0.41274301, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1725, "rewards/MazeReward/std": 0.20381629, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.146875, "completions/min_length": 44.8, "completions/max_length": 379.6, "completions/clipped_ratio": 0.0, "kl": 0.10231563, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.64102564, "global_step/max_steps": "1245/2000", "percentage": "62.25%", "elapsed_time": "12h 13m 49s", "remaining_time": "7h 25m 0s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028277} {"loss": 0.00427463, "grad_norm": 1.09272524, "learning_rate": 3.4e-07, "reward": 3.0890625, "reward_std": 0.42894481, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.1840625, "rewards/MazeReward/std": 0.20211475, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.378125, "completions/min_length": 46.0, "completions/max_length": 712.8, "completions/clipped_ratio": 0.0015625, "kl": 0.10686525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.68376068, "global_step/max_steps": "1250/2000", "percentage": "62.50%", "elapsed_time": "12h 17m 22s", "remaining_time": "7h 22m 25s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028253} {"loss": 0.00432482, "grad_norm": 1.31584967, "learning_rate": 3.3e-07, "reward": 2.809375, "reward_std": 0.55491105, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1559375, "rewards/MazeReward/std": 0.21197488, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.9953125, "completions/min_length": 45.8, "completions/max_length": 354.8, "completions/clipped_ratio": 0.0, "kl": 0.10811928, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.72649573, "global_step/max_steps": "1255/2000", "percentage": "62.75%", "elapsed_time": "12h 20m 12s", "remaining_time": "7h 19m 24s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028258} {"loss": 0.00428751, "grad_norm": 1.05985473, "learning_rate": 3.3e-07, "reward": 2.84199219, "reward_std": 0.6813662, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.159375, "rewards/MazeReward/std": 0.21843097, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 119.8640625, "completions/min_length": 45.4, "completions/max_length": 376.6, "completions/clipped_ratio": 0.0, "kl": 0.10716008, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.76923077, "global_step/max_steps": "1260/2000", "percentage": "63.00%", "elapsed_time": "12h 23m 4s", "remaining_time": "7h 16m 24s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028261} {"loss": 0.00474503, "grad_norm": 1.05960373, "learning_rate": 3.3e-07, "reward": 2.846875, "reward_std": 0.61917041, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1596875, "rewards/MazeReward/std": 0.21988585, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.6578125, "completions/min_length": 46.2, "completions/max_length": 343.2, "completions/clipped_ratio": 0.0, "kl": 0.11861755, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.81196581, "global_step/max_steps": "1265/2000", "percentage": "63.25%", "elapsed_time": "12h 25m 51s", "remaining_time": "7h 13m 21s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028267} {"loss": 0.00442058, "grad_norm": 0.86391569, "learning_rate": 3.2e-07, "reward": 2.965625, "reward_std": 0.52689296, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.1715625, "rewards/MazeReward/std": 0.19438618, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.2828125, "completions/min_length": 47.4, "completions/max_length": 338.4, "completions/clipped_ratio": 0.0, "kl": 0.11051362, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.85470085, "global_step/max_steps": "1270/2000", "percentage": "63.50%", "elapsed_time": "12h 28m 40s", "remaining_time": "7h 10m 20s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028272} {"loss": 0.00436118, "grad_norm": 1.01527461, "learning_rate": 3.2e-07, "reward": 2.9625, "reward_std": 0.2762804, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.17125, "rewards/MazeReward/std": 0.20243597, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.58125, "completions/min_length": 47.4, "completions/max_length": 323.8, "completions/clipped_ratio": 0.0, "kl": 0.1090246, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.8974359, "global_step/max_steps": "1275/2000", "percentage": "63.75%", "elapsed_time": "12h 31m 27s", "remaining_time": "7h 7m 17s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028279} {"loss": 0.00400066, "grad_norm": 0.82506123, "learning_rate": 3.1e-07, "reward": 2.6875, "reward_std": 0.57329983, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.14375001, "rewards/MazeReward/std": 0.18184928, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.1890625, "completions/min_length": 46.0, "completions/max_length": 334.2, "completions/clipped_ratio": 0.0, "kl": 0.10001397, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.94017094, "global_step/max_steps": "1280/2000", "percentage": "64.00%", "elapsed_time": "12h 34m 12s", "remaining_time": "7h 4m 14s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028286} {"loss": 0.00422045, "grad_norm": 0.87737452, "learning_rate": 3.1e-07, "reward": 2.759375, "reward_std": 0.63571578, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1509375, "rewards/MazeReward/std": 0.19424246, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.153125, "completions/min_length": 47.0, "completions/max_length": 376.2, "completions/clipped_ratio": 0.0, "kl": 0.10550106, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.98290598, "global_step/max_steps": "1285/2000", "percentage": "64.25%", "elapsed_time": "12h 37m 1s", "remaining_time": "7h 1m 13s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02829} {"loss": 0.00406795, "grad_norm": 0.66228383, "learning_rate": 3.1e-07, "reward": 3.0125, "reward_std": 0.44894702, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.17625, "rewards/MazeReward/std": 0.20109856, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.2421875, "completions/min_length": 46.2, "completions/max_length": 312.8, "completions/clipped_ratio": 0.0, "kl": 0.10168775, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.02564103, "global_step/max_steps": "1290/2000", "percentage": "64.50%", "elapsed_time": "12h 39m 48s", "remaining_time": "6h 58m 11s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028296} {"loss": 0.00421348, "grad_norm": 0.95402984, "learning_rate": 3e-07, "reward": 3.209375, "reward_std": 0.71374961, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.1959375, "rewards/MazeReward/std": 0.25006937, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.7203125, "completions/min_length": 47.8, "completions/max_length": 337.4, "completions/clipped_ratio": 0.0, "kl": 0.10531652, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.06837607, "global_step/max_steps": "1295/2000", "percentage": "64.75%", "elapsed_time": "12h 42m 37s", "remaining_time": "6h 55m 10s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028301} {"loss": 0.00399229, "grad_norm": 0.86974724, "learning_rate": 3e-07, "reward": 3.35625, "reward_std": 0.40894377, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.210625, "rewards/MazeReward/std": 0.22344834, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.8359375, "completions/min_length": 46.2, "completions/max_length": 343.6, "completions/clipped_ratio": 0.0, "kl": 0.09979213, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.11111111, "global_step/max_steps": "1300/2000", "percentage": "65.00%", "elapsed_time": "12h 45m 27s", "remaining_time": "6h 52m 10s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028305} {"loss": 0.00414821, "grad_norm": 0.80387, "learning_rate": 3e-07, "reward": 2.81875, "reward_std": 0.17559238, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.156875, "rewards/MazeReward/std": 0.18436635, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.2, "completions/min_length": 46.8, "completions/max_length": 321.2, "completions/clipped_ratio": 0.0, "kl": 0.10371306, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.15384615, "global_step/max_steps": "1305/2000", "percentage": "65.25%", "elapsed_time": "12h 49m 17s", "remaining_time": "6h 49m 42s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028273} {"loss": 0.00422846, "grad_norm": 1.16855156, "learning_rate": 2.9e-07, "reward": 2.9125, "reward_std": 0.61308444, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.16625, "rewards/MazeReward/std": 0.20742922, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.8859375, "completions/min_length": 45.0, "completions/max_length": 389.4, "completions/clipped_ratio": 0.0, "kl": 0.10571901, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.1965812, "global_step/max_steps": "1310/2000", "percentage": "65.50%", "elapsed_time": "12h 52m 10s", "remaining_time": "6h 46m 43s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028275} {"loss": 0.00397921, "grad_norm": 0.95254097, "learning_rate": 2.9e-07, "reward": 2.715625, "reward_std": 0.63621098, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.1465625, "rewards/MazeReward/std": 0.20846615, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.5921875, "completions/min_length": 44.8, "completions/max_length": 346.8, "completions/clipped_ratio": 0.0, "kl": 0.09947788, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.23931624, "global_step/max_steps": "1315/2000", "percentage": "65.75%", "elapsed_time": "12h 54m 58s", "remaining_time": "6h 43m 41s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028281} {"loss": 0.00397209, "grad_norm": 1.01274306, "learning_rate": 2.8e-07, "reward": 2.8421875, "reward_std": 0.5440542, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.159375, "rewards/MazeReward/std": 0.20577182, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.5765625, "completions/min_length": 48.0, "completions/max_length": 613.8, "completions/clipped_ratio": 0.0, "kl": 0.09928331, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.28205128, "global_step/max_steps": "1320/2000", "percentage": "66.00%", "elapsed_time": "12h 58m 16s", "remaining_time": "6h 40m 55s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028267} {"loss": 0.00423153, "grad_norm": 0.50376485, "learning_rate": 2.8e-07, "reward": 2.6625, "reward_std": 0.54982884, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.14125, "rewards/MazeReward/std": 0.20351186, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.5203125, "completions/min_length": 49.8, "completions/max_length": 345.0, "completions/clipped_ratio": 0.0, "kl": 0.10580486, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.32478632, "global_step/max_steps": "1325/2000", "percentage": "66.25%", "elapsed_time": "13h 1m 4s", "remaining_time": "6h 37m 54s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028273} {"loss": 0.0041853, "grad_norm": 1.03457832, "learning_rate": 2.8e-07, "reward": 2.89375, "reward_std": 0.33696596, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.164375, "rewards/MazeReward/std": 0.19246908, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.934375, "completions/min_length": 45.0, "completions/max_length": 364.4, "completions/clipped_ratio": 0.0, "kl": 0.10461925, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.36752137, "global_step/max_steps": "1330/2000", "percentage": "66.50%", "elapsed_time": "13h 3m 54s", "remaining_time": "6h 34m 53s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028277} {"loss": 0.00469844, "grad_norm": 0.63358184, "learning_rate": 2.7e-07, "reward": 2.909375, "reward_std": 0.55279526, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1659375, "rewards/MazeReward/std": 0.22284969, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.09375, "completions/min_length": 44.4, "completions/max_length": 341.0, "completions/clipped_ratio": 0.0, "kl": 0.11745279, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.41025641, "global_step/max_steps": "1335/2000", "percentage": "66.75%", "elapsed_time": "13h 6m 39s", "remaining_time": "6h 31m 51s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028284} {"loss": 0.00506243, "grad_norm": 0.13903079, "learning_rate": 2.7e-07, "reward": 3.159375, "reward_std": 0.33805927, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.1909375, "rewards/MazeReward/std": 0.22119205, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 115.3859375, "completions/min_length": 47.0, "completions/max_length": 313.4, "completions/clipped_ratio": 0.0, "kl": 0.12655634, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.45299145, "global_step/max_steps": "1340/2000", "percentage": "67.00%", "elapsed_time": "13h 9m 22s", "remaining_time": "6h 28m 47s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028292} {"loss": 0.00498795, "grad_norm": 0.97234794, "learning_rate": 2.7e-07, "reward": 2.85, "reward_std": 0.75886427, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.16, "rewards/MazeReward/std": 0.20993178, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.978125, "completions/min_length": 44.0, "completions/max_length": 340.4, "completions/clipped_ratio": 0.0, "kl": 0.12469737, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.4957265, "global_step/max_steps": "1345/2000", "percentage": "67.25%", "elapsed_time": "13h 12m 12s", "remaining_time": "6h 25m 47s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028297} {"loss": 0.00515069, "grad_norm": 0.5534387, "learning_rate": 2.6e-07, "reward": 2.846875, "reward_std": 0.3248968, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.15968751, "rewards/MazeReward/std": 0.20917013, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 109.29375, "completions/min_length": 45.8, "completions/max_length": 285.8, "completions/clipped_ratio": 0.0, "kl": 0.12874501, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.53846154, "global_step/max_steps": "1350/2000", "percentage": "67.50%", "elapsed_time": "13h 14m 52s", "remaining_time": "6h 22m 43s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028306} {"loss": 0.00475426, "grad_norm": 0.94145282, "learning_rate": 2.6e-07, "reward": 3.0359375, "reward_std": 0.48523493, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.17875, "rewards/MazeReward/std": 0.21105008, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 108.83125, "completions/min_length": 45.6, "completions/max_length": 634.2, "completions/clipped_ratio": 0.0015625, "kl": 0.11885395, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.58119658, "global_step/max_steps": "1355/2000", "percentage": "67.75%", "elapsed_time": "13h 18m 15s", "remaining_time": "6h 19m 58s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028291} {"loss": 0.0051045, "grad_norm": 1.18056737, "learning_rate": 2.5e-07, "reward": 2.9171875, "reward_std": 0.59616685, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.166875, "rewards/MazeReward/std": 0.21798062, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.1015625, "completions/min_length": 47.6, "completions/max_length": 666.6, "completions/clipped_ratio": 0.0015625, "kl": 0.12760761, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.62393162, "global_step/max_steps": "1360/2000", "percentage": "68.00%", "elapsed_time": "13h 21m 43s", "remaining_time": "6h 17m 16s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028272} {"loss": 0.00476524, "grad_norm": 1.17158982, "learning_rate": 2.5e-07, "reward": 3.478125, "reward_std": 0.70050123, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.2228125, "rewards/MazeReward/std": 0.25120379, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.7140625, "completions/min_length": 46.8, "completions/max_length": 361.0, "completions/clipped_ratio": 0.0, "kl": 0.11911844, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.66666667, "global_step/max_steps": "1365/2000", "percentage": "68.25%", "elapsed_time": "13h 24m 37s", "remaining_time": "6h 14m 18s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028274} {"loss": 0.00464483, "grad_norm": 0.87563163, "learning_rate": 2.5e-07, "reward": 3.053125, "reward_std": 0.50678086, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.18031251, "rewards/MazeReward/std": 0.23398211, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.2796875, "completions/min_length": 47.2, "completions/max_length": 329.0, "completions/clipped_ratio": 0.0, "kl": 0.11611179, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.70940171, "global_step/max_steps": "1370/2000", "percentage": "68.50%", "elapsed_time": "13h 27m 23s", "remaining_time": "6h 11m 16s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028281} {"loss": 0.00449227, "grad_norm": 1.25901833, "learning_rate": 2.4e-07, "reward": 3.503125, "reward_std": 0.62686791, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.22531251, "rewards/MazeReward/std": 0.21077939, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.0765625, "completions/min_length": 46.2, "completions/max_length": 336.0, "completions/clipped_ratio": 0.0, "kl": 0.112287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.75213675, "global_step/max_steps": "1375/2000", "percentage": "68.75%", "elapsed_time": "13h 30m 7s", "remaining_time": "6h 8m 14s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028288} {"loss": 0.00481081, "grad_norm": 1.19929295, "learning_rate": 2.4e-07, "reward": 2.684375, "reward_std": 0.4628313, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.1434375, "rewards/MazeReward/std": 0.20171171, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.615625, "completions/min_length": 45.6, "completions/max_length": 312.8, "completions/clipped_ratio": 0.0, "kl": 0.12027763, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.79487179, "global_step/max_steps": "1380/2000", "percentage": "69.00%", "elapsed_time": "13h 32m 53s", "remaining_time": "6h 5m 12s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028294} {"loss": 0.00451373, "grad_norm": 1.01039958, "learning_rate": 2.4e-07, "reward": 2.925, "reward_std": 0.63010332, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.1675, "rewards/MazeReward/std": 0.22027081, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.6328125, "completions/min_length": 48.6, "completions/max_length": 316.2, "completions/clipped_ratio": 0.0, "kl": 0.11282905, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.83760684, "global_step/max_steps": "1385/2000", "percentage": "69.25%", "elapsed_time": "13h 35m 41s", "remaining_time": "6h 2m 12s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028299} {"loss": 0.00429264, "grad_norm": 0.93743296, "learning_rate": 2.3e-07, "reward": 3.290625, "reward_std": 0.55763697, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.2040625, "rewards/MazeReward/std": 0.22283052, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.1765625, "completions/min_length": 46.2, "completions/max_length": 355.6, "completions/clipped_ratio": 0.0, "kl": 0.10730244, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.88034188, "global_step/max_steps": "1390/2000", "percentage": "69.50%", "elapsed_time": "13h 38m 30s", "remaining_time": "5h 59m 12s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028304} {"loss": 0.00450364, "grad_norm": 0.70253006, "learning_rate": 2.3e-07, "reward": 2.625, "reward_std": 0.7034467, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.13750001, "rewards/MazeReward/std": 0.21581574, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.728125, "completions/min_length": 48.6, "completions/max_length": 336.4, "completions/clipped_ratio": 0.0, "kl": 0.11257845, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.92307692, "global_step/max_steps": "1395/2000", "percentage": "69.75%", "elapsed_time": "13h 41m 14s", "remaining_time": "5h 56m 9s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028311} {"loss": 0.00444048, "grad_norm": 0.96301074, "learning_rate": 2.3e-07, "reward": 3.009375, "reward_std": 0.62718031, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.17593751, "rewards/MazeReward/std": 0.21609465, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.29375, "completions/min_length": 48.8, "completions/max_length": 351.2, "completions/clipped_ratio": 0.0, "kl": 0.11099038, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.96581197, "global_step/max_steps": "1400/2000", "percentage": "70.00%", "elapsed_time": "13h 44m 1s", "remaining_time": "5h 53m 9s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028316} {"loss": 0.00426018, "grad_norm": 1.050247, "learning_rate": 2.2e-07, "reward": 3.21875, "reward_std": 0.58231249, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.196875, "rewards/MazeReward/std": 0.22662649, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.184375, "completions/min_length": 46.8, "completions/max_length": 330.0, "completions/clipped_ratio": 0.0, "kl": 0.10649899, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.00854701, "global_step/max_steps": "1405/2000", "percentage": "70.25%", "elapsed_time": "13h 47m 50s", "remaining_time": "5h 50m 34s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028287} {"loss": 0.00442955, "grad_norm": 1.4774073, "learning_rate": 2.2e-07, "reward": 2.475, "reward_std": 0.42800033, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1225, "rewards/MazeReward/std": 0.19068383, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.7, "completions/min_length": 50.8, "completions/max_length": 327.4, "completions/clipped_ratio": 0.0, "kl": 0.11073335, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.05128205, "global_step/max_steps": "1410/2000", "percentage": "70.50%", "elapsed_time": "13h 50m 35s", "remaining_time": "5h 47m 33s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028293} {"loss": 0.00440788, "grad_norm": 0.80236902, "learning_rate": 2.2e-07, "reward": 2.846875, "reward_std": 0.75270727, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.1596875, "rewards/MazeReward/std": 0.21881043, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.815625, "completions/min_length": 48.4, "completions/max_length": 391.4, "completions/clipped_ratio": 0.0, "kl": 0.11019803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.09401709, "global_step/max_steps": "1415/2000", "percentage": "70.75%", "elapsed_time": "13h 53m 26s", "remaining_time": "5h 44m 34s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028296} {"loss": 0.00465553, "grad_norm": 1.14392062, "learning_rate": 2.1e-07, "reward": 3.15, "reward_std": 0.72762293, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.19000001, "rewards/MazeReward/std": 0.23026613, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.5859375, "completions/min_length": 47.0, "completions/max_length": 357.8, "completions/clipped_ratio": 0.0, "kl": 0.11639515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.13675214, "global_step/max_steps": "1420/2000", "percentage": "71.00%", "elapsed_time": "13h 56m 15s", "remaining_time": "5h 41m 34s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028301} {"loss": 0.00460033, "grad_norm": 0.07746797, "learning_rate": 2.1e-07, "reward": 2.984375, "reward_std": 0.45669057, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1734375, "rewards/MazeReward/std": 0.21304076, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.9546875, "completions/min_length": 48.4, "completions/max_length": 343.6, "completions/clipped_ratio": 0.0, "kl": 0.11500274, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.17948718, "global_step/max_steps": "1425/2000", "percentage": "71.25%", "elapsed_time": "13h 59m 0s", "remaining_time": "5h 38m 32s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028307} {"loss": 0.00443986, "grad_norm": 0.96384978, "learning_rate": 2.1e-07, "reward": 2.915625, "reward_std": 0.76699025, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.1665625, "rewards/MazeReward/std": 0.21833944, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.6953125, "completions/min_length": 44.0, "completions/max_length": 340.6, "completions/clipped_ratio": 0.0, "kl": 0.11099969, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.22222222, "global_step/max_steps": "1430/2000", "percentage": "71.50%", "elapsed_time": "14h 1m 47s", "remaining_time": "5h 35m 32s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028313} {"loss": 0.00461623, "grad_norm": 0.90580445, "learning_rate": 2e-07, "reward": 2.8, "reward_std": 1.03530844, "frac_reward_zero_std": 0.6875, "rewards/MazeReward/mean": 0.155, "rewards/MazeReward/std": 0.23934915, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.51875, "completions/min_length": 47.2, "completions/max_length": 350.4, "completions/clipped_ratio": 0.0, "kl": 0.11539947, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.26495726, "global_step/max_steps": "1435/2000", "percentage": "71.75%", "elapsed_time": "14h 4m 36s", "remaining_time": "5h 32m 32s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028317} {"loss": 0.00449113, "grad_norm": 1.46370204, "learning_rate": 2e-07, "reward": 2.90625, "reward_std": 0.48007759, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.16562499, "rewards/MazeReward/std": 0.20969884, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.1828125, "completions/min_length": 48.0, "completions/max_length": 315.0, "completions/clipped_ratio": 0.0, "kl": 0.11225918, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.30769231, "global_step/max_steps": "1440/2000", "percentage": "72.00%", "elapsed_time": "14h 7m 19s", "remaining_time": "5h 29m 30s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028324} {"loss": 0.00411971, "grad_norm": 0.68106331, "learning_rate": 2e-07, "reward": 3.08125, "reward_std": 0.53479406, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.18312501, "rewards/MazeReward/std": 0.22275058, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.4625, "completions/min_length": 46.8, "completions/max_length": 345.2, "completions/clipped_ratio": 0.0, "kl": 0.10299722, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.35042735, "global_step/max_steps": "1445/2000", "percentage": "72.25%", "elapsed_time": "14h 10m 8s", "remaining_time": "5h 26m 31s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028329} {"loss": 0.00481523, "grad_norm": 0.7530646, "learning_rate": 1.9e-07, "reward": 3.05625, "reward_std": 0.24052047, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.18062501, "rewards/MazeReward/std": 0.1982919, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.7078125, "completions/min_length": 45.4, "completions/max_length": 322.2, "completions/clipped_ratio": 0.0, "kl": 0.1203703, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.39316239, "global_step/max_steps": "1450/2000", "percentage": "72.50%", "elapsed_time": "14h 12m 51s", "remaining_time": "5h 23m 29s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028336} {"loss": 0.00460099, "grad_norm": 0.88552288, "learning_rate": 1.9e-07, "reward": 2.825, "reward_std": 0.35327176, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.1575, "rewards/MazeReward/std": 0.19714967, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.1796875, "completions/min_length": 45.6, "completions/max_length": 370.0, "completions/clipped_ratio": 0.0, "kl": 0.11501932, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.43589744, "global_step/max_steps": "1455/2000", "percentage": "72.75%", "elapsed_time": "14h 15m 40s", "remaining_time": "5h 20m 30s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02834} {"loss": 0.00438458, "grad_norm": 1.07072299, "learning_rate": 1.9e-07, "reward": 3.20625, "reward_std": 0.58653409, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.195625, "rewards/MazeReward/std": 0.2440807, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.2359375, "completions/min_length": 50.6, "completions/max_length": 348.6, "completions/clipped_ratio": 0.0, "kl": 0.10961649, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.47863248, "global_step/max_steps": "1460/2000", "percentage": "73.00%", "elapsed_time": "14h 18m 26s", "remaining_time": "5h 17m 30s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028346} {"loss": 0.0042128, "grad_norm": 0.83440075, "learning_rate": 1.8e-07, "reward": 2.86875, "reward_std": 0.47283043, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.161875, "rewards/MazeReward/std": 0.20645066, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.0171875, "completions/min_length": 48.0, "completions/max_length": 340.0, "completions/clipped_ratio": 0.0, "kl": 0.10531781, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.52136752, "global_step/max_steps": "1465/2000", "percentage": "73.25%", "elapsed_time": "14h 21m 11s", "remaining_time": "5h 14m 29s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028352} {"loss": 0.00442708, "grad_norm": 0.76803178, "learning_rate": 1.8e-07, "reward": 2.95625, "reward_std": 0.48648316, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.17062501, "rewards/MazeReward/std": 0.21959133, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.6984375, "completions/min_length": 46.6, "completions/max_length": 344.4, "completions/clipped_ratio": 0.0, "kl": 0.11068193, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.56410256, "global_step/max_steps": "1470/2000", "percentage": "73.50%", "elapsed_time": "14h 23m 59s", "remaining_time": "5h 11m 30s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028356} {"loss": 0.00436642, "grad_norm": 0.82136768, "learning_rate": 1.8e-07, "reward": 3.153125, "reward_std": 0.48257118, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.1903125, "rewards/MazeReward/std": 0.22344301, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.28125, "completions/min_length": 48.8, "completions/max_length": 341.8, "completions/clipped_ratio": 0.0, "kl": 0.10914505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.60683761, "global_step/max_steps": "1475/2000", "percentage": "73.75%", "elapsed_time": "14h 26m 46s", "remaining_time": "5h 8m 30s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028362} {"loss": 0.00409499, "grad_norm": 0.4507625, "learning_rate": 1.7e-07, "reward": 3.3, "reward_std": 0.45387779, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.205, "rewards/MazeReward/std": 0.23486512, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.70625, "completions/min_length": 47.6, "completions/max_length": 356.8, "completions/clipped_ratio": 0.0, "kl": 0.10233986, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.64957265, "global_step/max_steps": "1480/2000", "percentage": "74.00%", "elapsed_time": "14h 29m 34s", "remaining_time": "5h 5m 31s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028366} {"loss": 0.00416483, "grad_norm": 0.4974367, "learning_rate": 1.7e-07, "reward": 3.1796875, "reward_std": 0.21673506, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.193125, "rewards/MazeReward/std": 0.21075069, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.1671875, "completions/min_length": 49.4, "completions/max_length": 353.8, "completions/clipped_ratio": 0.0, "kl": 0.10411405, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.69230769, "global_step/max_steps": "1485/2000", "percentage": "74.25%", "elapsed_time": "14h 32m 23s", "remaining_time": "5h 2m 32s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02837} {"loss": 0.00423412, "grad_norm": 1.31048709, "learning_rate": 1.7e-07, "reward": 3.259375, "reward_std": 0.85263252, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.20093751, "rewards/MazeReward/std": 0.23574598, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.65625, "completions/min_length": 49.2, "completions/max_length": 417.4, "completions/clipped_ratio": 0.0, "kl": 0.10584182, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.73504274, "global_step/max_steps": "1490/2000", "percentage": "74.50%", "elapsed_time": "14h 35m 22s", "remaining_time": "4h 59m 37s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028369} {"loss": 0.00418281, "grad_norm": 1.15241972, "learning_rate": 1.6e-07, "reward": 3.3875, "reward_std": 0.81127899, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.21375, "rewards/MazeReward/std": 0.24604513, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.7375, "completions/min_length": 49.0, "completions/max_length": 435.8, "completions/clipped_ratio": 0.0, "kl": 0.1045557, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.77777778, "global_step/max_steps": "1495/2000", "percentage": "74.75%", "elapsed_time": "14h 38m 20s", "remaining_time": "4h 56m 41s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028368} {"loss": 0.00410249, "grad_norm": 1.03938228, "learning_rate": 1.6e-07, "reward": 3.134375, "reward_std": 0.5951692, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.1884375, "rewards/MazeReward/std": 0.22971763, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.11875, "completions/min_length": 49.6, "completions/max_length": 458.0, "completions/clipped_ratio": 0.0, "kl": 0.10254335, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.82051282, "global_step/max_steps": "1500/2000", "percentage": "75.00%", "elapsed_time": "14h 41m 21s", "remaining_time": "4h 53m 47s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028365} {"loss": 0.00400581, "grad_norm": 1.05728668, "learning_rate": 1.6e-07, "reward": 3.2625, "reward_std": 0.42726558, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.20125, "rewards/MazeReward/std": 0.20636436, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.5453125, "completions/min_length": 46.4, "completions/max_length": 359.6, "completions/clipped_ratio": 0.0, "kl": 0.10014389, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.86324786, "global_step/max_steps": "1505/2000", "percentage": "75.25%", "elapsed_time": "14h 45m 23s", "remaining_time": "4h 51m 12s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02833} {"loss": 0.00405019, "grad_norm": 0.92285176, "learning_rate": 1.6e-07, "reward": 2.9953125, "reward_std": 0.64275012, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.1746875, "rewards/MazeReward/std": 0.21778047, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 135.6796875, "completions/min_length": 48.0, "completions/max_length": 713.0, "completions/clipped_ratio": 0.0015625, "kl": 0.10124102, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.90598291, "global_step/max_steps": "1510/2000", "percentage": "75.50%", "elapsed_time": "14h 48m 52s", "remaining_time": "4h 48m 26s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028313} {"loss": 0.00420048, "grad_norm": 0.70581545, "learning_rate": 1.5e-07, "reward": 3.44667969, "reward_std": 0.51608422, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.2196875, "rewards/MazeReward/std": 0.23806649, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 134.66875, "completions/min_length": 48.0, "completions/max_length": 389.0, "completions/clipped_ratio": 0.0, "kl": 0.10501653, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.94871795, "global_step/max_steps": "1515/2000", "percentage": "75.75%", "elapsed_time": "14h 51m 44s", "remaining_time": "4h 45m 28s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028316} {"loss": 0.0041741, "grad_norm": 0.44572165, "learning_rate": 1.5e-07, "reward": 3.203125, "reward_std": 0.42872139, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.1953125, "rewards/MazeReward/std": 0.24020221, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.31875, "completions/min_length": 48.6, "completions/max_length": 415.6, "completions/clipped_ratio": 0.0, "kl": 0.10435408, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.99145299, "global_step/max_steps": "1520/2000", "percentage": "76.00%", "elapsed_time": "14h 54m 39s", "remaining_time": "4h 42m 31s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028316} {"loss": 0.00413442, "grad_norm": 1.16019461, "learning_rate": 1.5e-07, "reward": 3.31875, "reward_std": 0.76833533, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.206875, "rewards/MazeReward/std": 0.25922371, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.5453125, "completions/min_length": 50.6, "completions/max_length": 430.4, "completions/clipped_ratio": 0.0, "kl": 0.10335387, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.03418803, "global_step/max_steps": "1525/2000", "percentage": "76.25%", "elapsed_time": "14h 57m 38s", "remaining_time": "4h 39m 35s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028315} {"loss": 0.00407867, "grad_norm": 0.95080851, "learning_rate": 1.4e-07, "reward": 3.553125, "reward_std": 0.75468747, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.23031251, "rewards/MazeReward/std": 0.24887472, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.1671875, "completions/min_length": 47.6, "completions/max_length": 354.8, "completions/clipped_ratio": 0.0, "kl": 0.10196629, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.07692308, "global_step/max_steps": "1530/2000", "percentage": "76.50%", "elapsed_time": "15h 0m 26s", "remaining_time": "4h 36m 36s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028319} {"loss": 0.0039829, "grad_norm": 0.92700666, "learning_rate": 1.4e-07, "reward": 3.271875, "reward_std": 0.40897448, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.20218751, "rewards/MazeReward/std": 0.22852748, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.328125, "completions/min_length": 51.2, "completions/max_length": 342.6, "completions/clipped_ratio": 0.0, "kl": 0.09956137, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.11965812, "global_step/max_steps": "1535/2000", "percentage": "76.75%", "elapsed_time": "15h 3m 14s", "remaining_time": "4h 33m 37s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028324} {"loss": 0.00396872, "grad_norm": 0.61241609, "learning_rate": 1.4e-07, "reward": 2.96875, "reward_std": 0.79036005, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.171875, "rewards/MazeReward/std": 0.23524323, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 135.9625, "completions/min_length": 50.6, "completions/max_length": 441.8, "completions/clipped_ratio": 0.0, "kl": 0.09921041, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.16239316, "global_step/max_steps": "1540/2000", "percentage": "77.00%", "elapsed_time": "15h 6m 14s", "remaining_time": "4h 30m 41s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028322} {"loss": 0.0041557, "grad_norm": 1.02846123, "learning_rate": 1.3e-07, "reward": 3.121875, "reward_std": 0.49660008, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1871875, "rewards/MazeReward/std": 0.24737908, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 137.615625, "completions/min_length": 50.0, "completions/max_length": 379.4, "completions/clipped_ratio": 0.0, "kl": 0.10387767, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.20512821, "global_step/max_steps": "1545/2000", "percentage": "77.25%", "elapsed_time": "15h 9m 3s", "remaining_time": "4h 27m 43s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028326} {"loss": 0.00407104, "grad_norm": 1.16062176, "learning_rate": 1.3e-07, "reward": 3.05761719, "reward_std": 0.6226504, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.18093751, "rewards/MazeReward/std": 0.2325268, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 135.1421875, "completions/min_length": 48.0, "completions/max_length": 413.4, "completions/clipped_ratio": 0.0, "kl": 0.10176368, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.24786325, "global_step/max_steps": "1550/2000", "percentage": "77.50%", "elapsed_time": "15h 12m 2s", "remaining_time": "4h 24m 47s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028325} {"loss": 0.00423348, "grad_norm": 0.05934888, "learning_rate": 1.3e-07, "reward": 3.5515625, "reward_std": 0.56461858, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.23031251, "rewards/MazeReward/std": 0.23733252, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.353125, "completions/min_length": 52.4, "completions/max_length": 402.6, "completions/clipped_ratio": 0.0, "kl": 0.10582958, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.29059829, "global_step/max_steps": "1555/2000", "percentage": "77.75%", "elapsed_time": "15h 14m 57s", "remaining_time": "4h 21m 50s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028326} {"loss": 0.00375819, "grad_norm": 0.89672781, "learning_rate": 1.3e-07, "reward": 3.31875, "reward_std": 0.81194406, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.20687501, "rewards/MazeReward/std": 0.26889552, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 140.5203125, "completions/min_length": 54.2, "completions/max_length": 381.8, "completions/clipped_ratio": 0.0, "kl": 0.09393653, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.33333333, "global_step/max_steps": "1560/2000", "percentage": "78.00%", "elapsed_time": "15h 17m 50s", "remaining_time": "4h 18m 52s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028327} {"loss": 0.00395788, "grad_norm": 0.66898871, "learning_rate": 1.2e-07, "reward": 3.55, "reward_std": 0.37789677, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.23000001, "rewards/MazeReward/std": 0.22691604, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 143.8890625, "completions/min_length": 51.4, "completions/max_length": 452.0, "completions/clipped_ratio": 0.0, "kl": 0.09893502, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.37606838, "global_step/max_steps": "1565/2000", "percentage": "78.25%", "elapsed_time": "15h 20m 54s", "remaining_time": "4h 15m 58s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028324} {"loss": 0.00356725, "grad_norm": 0.83056848, "learning_rate": 1.2e-07, "reward": 3.24375, "reward_std": 0.55531446, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.19937501, "rewards/MazeReward/std": 0.23777499, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.734375, "completions/min_length": 49.6, "completions/max_length": 350.2, "completions/clipped_ratio": 0.0, "kl": 0.0891789, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.41880342, "global_step/max_steps": "1570/2000", "percentage": "78.50%", "elapsed_time": "15h 23m 42s", "remaining_time": "4h 12m 59s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028328} {"loss": 0.00413149, "grad_norm": 0.63416208, "learning_rate": 1.2e-07, "reward": 3.425, "reward_std": 0.67447208, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.2175, "rewards/MazeReward/std": 0.25507333, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.6171875, "completions/min_length": 55.0, "completions/max_length": 365.2, "completions/clipped_ratio": 0.0, "kl": 0.10329319, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.46153846, "global_step/max_steps": "1575/2000", "percentage": "78.75%", "elapsed_time": "15h 26m 32s", "remaining_time": "4h 10m 1s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028331} {"loss": 0.00384293, "grad_norm": 0.46143972, "learning_rate": 1.2e-07, "reward": 3.584375, "reward_std": 0.52868327, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.2334375, "rewards/MazeReward/std": 0.25245186, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.471875, "completions/min_length": 54.2, "completions/max_length": 343.4, "completions/clipped_ratio": 0.0, "kl": 0.09606116, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.5042735, "global_step/max_steps": "1580/2000", "percentage": "79.00%", "elapsed_time": "15h 29m 19s", "remaining_time": "4h 7m 2s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028336} {"loss": 0.00365998, "grad_norm": 1.02387218, "learning_rate": 1.1e-07, "reward": 3.384375, "reward_std": 0.64773528, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.21375001, "rewards/MazeReward/std": 0.21816131, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.02490138, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 144.0140625, "completions/min_length": 50.0, "completions/max_length": 727.8, "completions/clipped_ratio": 0.0015625, "kl": 0.09148285, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.54700855, "global_step/max_steps": "1585/2000", "percentage": "79.25%", "elapsed_time": "15h 32m 53s", "remaining_time": "4h 4m 15s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028317} {"loss": 0.00387716, "grad_norm": 0.95212518, "learning_rate": 1.1e-07, "reward": 3.640625, "reward_std": 0.95441318, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.2390625, "rewards/MazeReward/std": 0.26780157, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 138.340625, "completions/min_length": 55.0, "completions/max_length": 391.0, "completions/clipped_ratio": 0.0, "kl": 0.09690871, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.58974359, "global_step/max_steps": "1590/2000", "percentage": "79.50%", "elapsed_time": "15h 35m 50s", "remaining_time": "4h 1m 19s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028317} {"loss": 0.00373193, "grad_norm": 0.04557116, "learning_rate": 1.1e-07, "reward": 3.0359375, "reward_std": 0.41956892, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.17875, "rewards/MazeReward/std": 0.21701315, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 141.0671875, "completions/min_length": 50.2, "completions/max_length": 684.0, "completions/clipped_ratio": 0.0015625, "kl": 0.09328376, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.63247863, "global_step/max_steps": "1595/2000", "percentage": "79.75%", "elapsed_time": "15h 39m 20s", "remaining_time": "3h 58m 30s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.0283} {"loss": 0.0041183, "grad_norm": 1.20256315, "learning_rate": 1.1e-07, "reward": 3.2875, "reward_std": 0.61734234, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.20375, "rewards/MazeReward/std": 0.22543152, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.0828125, "completions/min_length": 49.4, "completions/max_length": 374.2, "completions/clipped_ratio": 0.0, "kl": 0.10294594, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.67521368, "global_step/max_steps": "1600/2000", "percentage": "80.00%", "elapsed_time": "15h 42m 11s", "remaining_time": "3h 55m 32s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028303} {"loss": 0.00415633, "grad_norm": 0.77968615, "learning_rate": 1e-07, "reward": 3.38574219, "reward_std": 0.74480401, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.21375, "rewards/MazeReward/std": 0.25732822, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 135.4375, "completions/min_length": 46.4, "completions/max_length": 698.4, "completions/clipped_ratio": 0.0015625, "kl": 0.10390158, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.71794872, "global_step/max_steps": "1605/2000", "percentage": "80.25%", "elapsed_time": "15h 46m 52s", "remaining_time": "3h 53m 1s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028251} {"loss": 0.004261, "grad_norm": 0.98910838, "learning_rate": 1e-07, "reward": 3.246875, "reward_std": 0.67859879, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.1996875, "rewards/MazeReward/std": 0.25292297, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.2, "completions/min_length": 50.4, "completions/max_length": 362.8, "completions/clipped_ratio": 0.0, "kl": 0.10653561, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.76068376, "global_step/max_steps": "1610/2000", "percentage": "80.50%", "elapsed_time": "15h 49m 42s", "remaining_time": "3h 50m 3s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028254} {"loss": 0.00466516, "grad_norm": 0.62807361, "learning_rate": 1e-07, "reward": 3.175, "reward_std": 0.45869364, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1925, "rewards/MazeReward/std": 0.23270583, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.7015625, "completions/min_length": 46.2, "completions/max_length": 367.0, "completions/clipped_ratio": 0.0, "kl": 0.11661677, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.8034188, "global_step/max_steps": "1615/2000", "percentage": "80.75%", "elapsed_time": "15h 52m 35s", "remaining_time": "3h 47m 5s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028256} {"loss": 0.00414684, "grad_norm": 1.12299574, "learning_rate": 1e-07, "reward": 3.365625, "reward_std": 0.58509004, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.2115625, "rewards/MazeReward/std": 0.24423626, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.015625, "completions/min_length": 47.8, "completions/max_length": 389.8, "completions/clipped_ratio": 0.0, "kl": 0.10365096, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.84615385, "global_step/max_steps": "1620/2000", "percentage": "81.00%", "elapsed_time": "15h 55m 30s", "remaining_time": "3h 44m 7s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028257} {"loss": 0.00451275, "grad_norm": 1.30513445, "learning_rate": 9e-08, "reward": 3.371875, "reward_std": 0.62071723, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.21218751, "rewards/MazeReward/std": 0.24975141, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.1359375, "completions/min_length": 49.0, "completions/max_length": 431.4, "completions/clipped_ratio": 0.0, "kl": 0.11281033, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.88888889, "global_step/max_steps": "1625/2000", "percentage": "81.25%", "elapsed_time": "15h 58m 26s", "remaining_time": "3h 41m 10s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028257} {"loss": 0.00427096, "grad_norm": 0.40871228, "learning_rate": 9e-08, "reward": 2.965625, "reward_std": 0.38373722, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.1715625, "rewards/MazeReward/std": 0.2189858, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.565625, "completions/min_length": 47.0, "completions/max_length": 370.0, "completions/clipped_ratio": 0.0, "kl": 0.1067576, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.93162393, "global_step/max_steps": "1630/2000", "percentage": "81.50%", "elapsed_time": "16h 1m 20s", "remaining_time": "3h 38m 13s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028259} {"loss": 0.00437529, "grad_norm": 0.98538783, "learning_rate": 9e-08, "reward": 2.846875, "reward_std": 0.34098189, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.1596875, "rewards/MazeReward/std": 0.1921182, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.6375, "completions/min_length": 48.8, "completions/max_length": 353.8, "completions/clipped_ratio": 0.0, "kl": 0.10937571, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.97435897, "global_step/max_steps": "1635/2000", "percentage": "81.75%", "elapsed_time": "16h 4m 11s", "remaining_time": "3h 35m 14s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028262} {"loss": 0.00428175, "grad_norm": 0.6220138, "learning_rate": 9e-08, "reward": 3.50625, "reward_std": 0.54771358, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.225625, "rewards/MazeReward/std": 0.24167807, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.046875, "completions/min_length": 48.0, "completions/max_length": 426.0, "completions/clipped_ratio": 0.0, "kl": 0.10704007, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.01709402, "global_step/max_steps": "1640/2000", "percentage": "82.00%", "elapsed_time": "16h 7m 10s", "remaining_time": "3h 32m 18s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028261} {"loss": 0.00441168, "grad_norm": 1.18072235, "learning_rate": 8e-08, "reward": 3.28125, "reward_std": 0.53376545, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.20312501, "rewards/MazeReward/std": 0.22464994, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.753125, "completions/min_length": 49.6, "completions/max_length": 339.4, "completions/clipped_ratio": 0.0, "kl": 0.11027585, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.05982906, "global_step/max_steps": "1645/2000", "percentage": "82.25%", "elapsed_time": "16h 10m 0s", "remaining_time": "3h 29m 19s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028264} {"loss": 0.00465377, "grad_norm": 0.66451962, "learning_rate": 8e-08, "reward": 3.659375, "reward_std": 0.41283482, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.2409375, "rewards/MazeReward/std": 0.26526185, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.665625, "completions/min_length": 51.2, "completions/max_length": 317.2, "completions/clipped_ratio": 0.0, "kl": 0.11633408, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.1025641, "global_step/max_steps": "1650/2000", "percentage": "82.50%", "elapsed_time": "16h 12m 48s", "remaining_time": "3h 26m 21s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028269} {"loss": 0.00460227, "grad_norm": 0.05438864, "learning_rate": 8e-08, "reward": 3.1125, "reward_std": 0.1789622, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.18625, "rewards/MazeReward/std": 0.21130215, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.334375, "completions/min_length": 47.4, "completions/max_length": 333.6, "completions/clipped_ratio": 0.0, "kl": 0.11503907, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.14529915, "global_step/max_steps": "1655/2000", "percentage": "82.75%", "elapsed_time": "16h 15m 37s", "remaining_time": "3h 23m 22s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028272} {"loss": 0.00429397, "grad_norm": 0.43688587, "learning_rate": 8e-08, "reward": 3.446875, "reward_std": 0.65368781, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.2196875, "rewards/MazeReward/std": 0.25045114, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.6921875, "completions/min_length": 46.2, "completions/max_length": 335.0, "completions/clipped_ratio": 0.0, "kl": 0.10734037, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.18803419, "global_step/max_steps": "1660/2000", "percentage": "83.00%", "elapsed_time": "16h 18m 24s", "remaining_time": "3h 20m 23s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028277} {"loss": 0.00418952, "grad_norm": 0.84480987, "learning_rate": 7e-08, "reward": 2.959375, "reward_std": 0.48555052, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.1709375, "rewards/MazeReward/std": 0.20453032, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.55625, "completions/min_length": 47.2, "completions/max_length": 381.4, "completions/clipped_ratio": 0.0, "kl": 0.10472418, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.23076923, "global_step/max_steps": "1665/2000", "percentage": "83.25%", "elapsed_time": "16h 21m 16s", "remaining_time": "3h 17m 26s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02828} {"loss": 0.00398587, "grad_norm": 0.52296879, "learning_rate": 7e-08, "reward": 3.35, "reward_std": 0.49397459, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.21000001, "rewards/MazeReward/std": 0.23409393, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.9421875, "completions/min_length": 48.6, "completions/max_length": 360.0, "completions/clipped_ratio": 0.0, "kl": 0.09963854, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.27350427, "global_step/max_steps": "1670/2000", "percentage": "83.50%", "elapsed_time": "16h 24m 3s", "remaining_time": "3h 14m 27s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028284} {"loss": 0.00410234, "grad_norm": 1.07204803, "learning_rate": 7e-08, "reward": 3.6546875, "reward_std": 0.78844584, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.24062501, "rewards/MazeReward/std": 0.25811377, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.346875, "completions/min_length": 47.8, "completions/max_length": 700.4, "completions/clipped_ratio": 0.0015625, "kl": 0.10253564, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.31623932, "global_step/max_steps": "1675/2000", "percentage": "83.75%", "elapsed_time": "16h 27m 33s", "remaining_time": "3h 11m 37s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028268} {"loss": 0.004236, "grad_norm": 0.52872329, "learning_rate": 7e-08, "reward": 3.190625, "reward_std": 0.58825822, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.1940625, "rewards/MazeReward/std": 0.24067834, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.48125, "completions/min_length": 48.6, "completions/max_length": 371.2, "completions/clipped_ratio": 0.0, "kl": 0.10590211, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.35897436, "global_step/max_steps": "1680/2000", "percentage": "84.00%", "elapsed_time": "16h 30m 24s", "remaining_time": "3h 8m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028271} {"loss": 0.00436282, "grad_norm": 0.89300059, "learning_rate": 7e-08, "reward": 3.359375, "reward_std": 0.74673238, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.2109375, "rewards/MazeReward/std": 0.25135612, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.9703125, "completions/min_length": 49.8, "completions/max_length": 440.4, "completions/clipped_ratio": 0.0, "kl": 0.10905647, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.4017094, "global_step/max_steps": "1685/2000", "percentage": "84.25%", "elapsed_time": "16h 33m 23s", "remaining_time": "3h 5m 42s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02827} {"loss": 0.00414674, "grad_norm": 0.88429691, "learning_rate": 6e-08, "reward": 3.1625, "reward_std": 0.58146068, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.19125001, "rewards/MazeReward/std": 0.2381917, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.4796875, "completions/min_length": 48.0, "completions/max_length": 401.0, "completions/clipped_ratio": 0.0, "kl": 0.10364812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.44444444, "global_step/max_steps": "1690/2000", "percentage": "84.50%", "elapsed_time": "16h 36m 17s", "remaining_time": "3h 2m 45s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028272} {"loss": 0.00459654, "grad_norm": 0.73080751, "learning_rate": 6e-08, "reward": 3.821875, "reward_std": 0.93921335, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.25718749, "rewards/MazeReward/std": 0.2882909, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.4609375, "completions/min_length": 52.4, "completions/max_length": 376.6, "completions/clipped_ratio": 0.0, "kl": 0.11490622, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.48717949, "global_step/max_steps": "1695/2000", "percentage": "84.75%", "elapsed_time": "16h 39m 8s", "remaining_time": "2h 59m 47s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028274} {"loss": 0.00394969, "grad_norm": 0.52036567, "learning_rate": 6e-08, "reward": 3.175, "reward_std": 0.47585676, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.1925, "rewards/MazeReward/std": 0.22184069, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.490625, "completions/min_length": 46.8, "completions/max_length": 327.2, "completions/clipped_ratio": 0.0, "kl": 0.09874771, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.52991453, "global_step/max_steps": "1700/2000", "percentage": "85.00%", "elapsed_time": "16h 41m 52s", "remaining_time": "2h 56m 48s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02828} {"loss": 0.00435583, "grad_norm": 0.66970322, "learning_rate": 6e-08, "reward": 3.5125, "reward_std": 0.4158942, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.22625, "rewards/MazeReward/std": 0.24999509, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.51875, "completions/min_length": 46.4, "completions/max_length": 342.8, "completions/clipped_ratio": 0.0, "kl": 0.10887363, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.57264957, "global_step/max_steps": "1705/2000", "percentage": "85.25%", "elapsed_time": "16h 45m 49s", "remaining_time": "2h 54m 1s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028252} {"loss": 0.00442268, "grad_norm": 0.7010707, "learning_rate": 6e-08, "reward": 3.278125, "reward_std": 0.38592587, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.2028125, "rewards/MazeReward/std": 0.23284581, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.8578125, "completions/min_length": 48.0, "completions/max_length": 348.2, "completions/clipped_ratio": 0.0, "kl": 0.11055122, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.61538462, "global_step/max_steps": "1710/2000", "percentage": "85.50%", "elapsed_time": "16h 48m 39s", "remaining_time": "2h 51m 3s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028255} {"loss": 0.0043262, "grad_norm": 0.66848991, "learning_rate": 5e-08, "reward": 3.196875, "reward_std": 0.60697131, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1946875, "rewards/MazeReward/std": 0.23656202, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.209375, "completions/min_length": 46.4, "completions/max_length": 361.2, "completions/clipped_ratio": 0.0, "kl": 0.10815434, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.65811966, "global_step/max_steps": "1715/2000", "percentage": "85.75%", "elapsed_time": "16h 51m 28s", "remaining_time": "2h 48m 5s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028259} {"loss": 0.00432272, "grad_norm": 0.79444248, "learning_rate": 5e-08, "reward": 3.403125, "reward_std": 0.87900053, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.2153125, "rewards/MazeReward/std": 0.27138483, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.9078125, "completions/min_length": 48.6, "completions/max_length": 348.0, "completions/clipped_ratio": 0.0, "kl": 0.10805549, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.7008547, "global_step/max_steps": "1720/2000", "percentage": "86.00%", "elapsed_time": "16h 54m 16s", "remaining_time": "2h 45m 6s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028263} {"loss": 0.00431432, "grad_norm": 0.83119644, "learning_rate": 5e-08, "reward": 3.321875, "reward_std": 0.47051497, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.20718751, "rewards/MazeReward/std": 0.23688737, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.675, "completions/min_length": 47.0, "completions/max_length": 353.2, "completions/clipped_ratio": 0.0, "kl": 0.10785717, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.74358974, "global_step/max_steps": "1725/2000", "percentage": "86.25%", "elapsed_time": "16h 57m 4s", "remaining_time": "2h 42m 8s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028267} {"loss": 0.00409688, "grad_norm": 1.24123659, "learning_rate": 5e-08, "reward": 3.59375, "reward_std": 0.75415707, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.234375, "rewards/MazeReward/std": 0.28494342, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.0015625, "completions/min_length": 48.2, "completions/max_length": 362.4, "completions/clipped_ratio": 0.0, "kl": 0.1024201, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.78632479, "global_step/max_steps": "1730/2000", "percentage": "86.50%", "elapsed_time": "16h 59m 56s", "remaining_time": "2h 39m 10s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02827} {"loss": 0.00435945, "grad_norm": 1.11050874, "learning_rate": 5e-08, "reward": 3.5375, "reward_std": 0.7639246, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.22875, "rewards/MazeReward/std": 0.24069325, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.9265625, "completions/min_length": 48.4, "completions/max_length": 347.4, "completions/clipped_ratio": 0.0, "kl": 0.10899119, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.82905983, "global_step/max_steps": "1735/2000", "percentage": "86.75%", "elapsed_time": "17h 2m 42s", "remaining_time": "2h 36m 12s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028274} {"loss": 0.00439256, "grad_norm": 0.7388593, "learning_rate": 5e-08, "reward": 3.678125, "reward_std": 0.7469539, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.2428125, "rewards/MazeReward/std": 0.29002054, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.603125, "completions/min_length": 50.2, "completions/max_length": 386.2, "completions/clipped_ratio": 0.0, "kl": 0.10980641, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.87179487, "global_step/max_steps": "1740/2000", "percentage": "87.00%", "elapsed_time": "17h 5m 36s", "remaining_time": "2h 33m 15s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028276} {"loss": 0.00437613, "grad_norm": 0.43934121, "learning_rate": 4e-08, "reward": 3.409375, "reward_std": 0.28987479, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.2159375, "rewards/MazeReward/std": 0.24109179, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.2125, "completions/min_length": 46.8, "completions/max_length": 347.4, "completions/clipped_ratio": 0.0, "kl": 0.10940547, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.91452991, "global_step/max_steps": "1745/2000", "percentage": "87.25%", "elapsed_time": "17h 8m 25s", "remaining_time": "2h 30m 17s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02828} {"loss": 0.00450138, "grad_norm": 1.18788556, "learning_rate": 4e-08, "reward": 3.640625, "reward_std": 0.87815031, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.23906251, "rewards/MazeReward/std": 0.29237211, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.225, "completions/min_length": 48.0, "completions/max_length": 315.2, "completions/clipped_ratio": 0.0, "kl": 0.11253669, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.95726496, "global_step/max_steps": "1750/2000", "percentage": "87.50%", "elapsed_time": "17h 11m 8s", "remaining_time": "2h 27m 18s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028286} {"loss": 0.00429477, "grad_norm": 0.8092897, "learning_rate": 4e-08, "reward": 3.684375, "reward_std": 0.71900274, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.2434375, "rewards/MazeReward/std": 0.26887894, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.45625, "completions/min_length": 44.2, "completions/max_length": 348.2, "completions/clipped_ratio": 0.0, "kl": 0.10736672, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.0, "global_step/max_steps": "1755/2000", "percentage": "87.75%", "elapsed_time": "17h 13m 57s", "remaining_time": "2h 24m 20s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02829} {"loss": 0.00450262, "grad_norm": 0.58309909, "learning_rate": 4e-08, "reward": 3.46875, "reward_std": 0.33688004, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.22187501, "rewards/MazeReward/std": 0.2486949, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.8734375, "completions/min_length": 44.6, "completions/max_length": 399.0, "completions/clipped_ratio": 0.0, "kl": 0.11255799, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.04273504, "global_step/max_steps": "1760/2000", "percentage": "88.00%", "elapsed_time": "17h 16m 49s", "remaining_time": "2h 21m 23s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028292} {"loss": 0.00436579, "grad_norm": 0.43282704, "learning_rate": 4e-08, "reward": 4.06699219, "reward_std": 0.75645022, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.28187501, "rewards/MazeReward/std": 0.27423061, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 128.025, "completions/min_length": 45.0, "completions/max_length": 705.2, "completions/clipped_ratio": 0.0015625, "kl": 0.1091362, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.08547009, "global_step/max_steps": "1765/2000", "percentage": "88.25%", "elapsed_time": "17h 20m 18s", "remaining_time": "2h 18m 30s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028277} {"loss": 0.00434699, "grad_norm": 0.81299525, "learning_rate": 4e-08, "reward": 3.284375, "reward_std": 0.66719788, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.2034375, "rewards/MazeReward/std": 0.24995078, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.975, "completions/min_length": 47.4, "completions/max_length": 363.6, "completions/clipped_ratio": 0.0, "kl": 0.10865908, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.12820513, "global_step/max_steps": "1770/2000", "percentage": "88.50%", "elapsed_time": "17h 23m 7s", "remaining_time": "2h 15m 32s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02828} {"loss": 0.00419814, "grad_norm": 1.10870664, "learning_rate": 3e-08, "reward": 3.5125, "reward_std": 0.40840748, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.22625001, "rewards/MazeReward/std": 0.24444263, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.7359375, "completions/min_length": 50.6, "completions/max_length": 339.4, "completions/clipped_ratio": 0.0, "kl": 0.1049537, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.17094017, "global_step/max_steps": "1775/2000", "percentage": "88.75%", "elapsed_time": "17h 25m 55s", "remaining_time": "2h 12m 34s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028285} {"loss": 0.00424328, "grad_norm": 0.76140628, "learning_rate": 3e-08, "reward": 3.178125, "reward_std": 0.47218041, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.1928125, "rewards/MazeReward/std": 0.23438347, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.8203125, "completions/min_length": 47.2, "completions/max_length": 351.2, "completions/clipped_ratio": 0.0, "kl": 0.1060746, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.21367521, "global_step/max_steps": "1780/2000", "percentage": "89.00%", "elapsed_time": "17h 28m 45s", "remaining_time": "2h 9m 37s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028288} {"loss": 0.00426247, "grad_norm": 0.61274894, "learning_rate": 3e-08, "reward": 3.496875, "reward_std": 0.36138367, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.2246875, "rewards/MazeReward/std": 0.23243287, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.6671875, "completions/min_length": 49.4, "completions/max_length": 389.2, "completions/clipped_ratio": 0.0, "kl": 0.10655979, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.25641026, "global_step/max_steps": "1785/2000", "percentage": "89.25%", "elapsed_time": "17h 31m 38s", "remaining_time": "2h 6m 40s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028289} {"loss": 0.00461703, "grad_norm": 0.967511, "learning_rate": 3e-08, "reward": 3.2, "reward_std": 0.3629581, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.195, "rewards/MazeReward/std": 0.24108279, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.134375, "completions/min_length": 50.0, "completions/max_length": 376.4, "completions/clipped_ratio": 0.0, "kl": 0.11542836, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.2991453, "global_step/max_steps": "1790/2000", "percentage": "89.50%", "elapsed_time": "17h 34m 30s", "remaining_time": "2h 3m 42s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028291} {"loss": 0.00428568, "grad_norm": 0.57965723, "learning_rate": 3e-08, "reward": 3.521875, "reward_std": 0.49462694, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.22718751, "rewards/MazeReward/std": 0.26889726, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.8078125, "completions/min_length": 46.2, "completions/max_length": 391.2, "completions/clipped_ratio": 0.0, "kl": 0.1071319, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.34188034, "global_step/max_steps": "1795/2000", "percentage": "89.75%", "elapsed_time": "17h 37m 23s", "remaining_time": "2h 0m 45s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028293} {"loss": 0.00392963, "grad_norm": 0.05075725, "learning_rate": 3e-08, "reward": 3.671875, "reward_std": 0.53627786, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.24218751, "rewards/MazeReward/std": 0.24283812, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.8359375, "completions/min_length": 49.2, "completions/max_length": 319.8, "completions/clipped_ratio": 0.0, "kl": 0.09822466, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.38461538, "global_step/max_steps": "1800/2000", "percentage": "90.00%", "elapsed_time": "17h 40m 8s", "remaining_time": "1h 57m 47s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028298} {"loss": 0.00457508, "grad_norm": 0.70722938, "learning_rate": 3e-08, "reward": 3.53125, "reward_std": 0.45395667, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.228125, "rewards/MazeReward/std": 0.25795353, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.79375, "completions/min_length": 45.6, "completions/max_length": 334.6, "completions/clipped_ratio": 0.0, "kl": 0.11436029, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.42735043, "global_step/max_steps": "1805/2000", "percentage": "90.25%", "elapsed_time": "17h 44m 4s", "remaining_time": "1h 54m 57s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028272} {"loss": 0.00470421, "grad_norm": 1.13851853, "learning_rate": 2e-08, "reward": 4.0, "reward_std": 0.65198538, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.27500001, "rewards/MazeReward/std": 0.28275522, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.3171875, "completions/min_length": 51.8, "completions/max_length": 322.8, "completions/clipped_ratio": 0.0, "kl": 0.11759205, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.47008547, "global_step/max_steps": "1810/2000", "percentage": "90.50%", "elapsed_time": "17h 46m 50s", "remaining_time": "1h 51m 59s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028277} {"loss": 0.0042262, "grad_norm": 0.95641712, "learning_rate": 2e-08, "reward": 3.725, "reward_std": 0.57202956, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.2475, "rewards/MazeReward/std": 0.25868377, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.0671875, "completions/min_length": 47.2, "completions/max_length": 311.2, "completions/clipped_ratio": 0.0, "kl": 0.10564615, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.51282051, "global_step/max_steps": "1815/2000", "percentage": "90.75%", "elapsed_time": "17h 49m 34s", "remaining_time": "1h 49m 1s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028282} {"loss": 0.00419338, "grad_norm": 1.24065299, "learning_rate": 2e-08, "reward": 3.56875, "reward_std": 0.59791383, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.231875, "rewards/MazeReward/std": 0.26024834, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.2984375, "completions/min_length": 44.0, "completions/max_length": 330.2, "completions/clipped_ratio": 0.0, "kl": 0.1048233, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.55555556, "global_step/max_steps": "1820/2000", "percentage": "91.00%", "elapsed_time": "17h 52m 21s", "remaining_time": "1h 46m 3s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028287} {"loss": 0.00431981, "grad_norm": 0.70175194, "learning_rate": 2e-08, "reward": 3.378125, "reward_std": 0.43282521, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.2128125, "rewards/MazeReward/std": 0.23691976, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.415625, "completions/min_length": 44.6, "completions/max_length": 315.0, "completions/clipped_ratio": 0.0, "kl": 0.10801099, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.5982906, "global_step/max_steps": "1825/2000", "percentage": "91.25%", "elapsed_time": "17h 55m 3s", "remaining_time": "1h 43m 5s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028293} {"loss": 0.0042673, "grad_norm": 0.66322774, "learning_rate": 2e-08, "reward": 3.559375, "reward_std": 0.50121611, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.2309375, "rewards/MazeReward/std": 0.27313117, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.8125, "completions/min_length": 49.2, "completions/max_length": 344.6, "completions/clipped_ratio": 0.0, "kl": 0.10668526, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.64102564, "global_step/max_steps": "1830/2000", "percentage": "91.50%", "elapsed_time": "17h 57m 51s", "remaining_time": "1h 40m 7s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028297} {"loss": 0.00463317, "grad_norm": 0.5135443, "learning_rate": 2e-08, "reward": 3.34375, "reward_std": 0.49532705, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.20937501, "rewards/MazeReward/std": 0.25516411, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.003125, "completions/min_length": 47.6, "completions/max_length": 348.8, "completions/clipped_ratio": 0.0, "kl": 0.11582731, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.68376068, "global_step/max_steps": "1835/2000", "percentage": "91.75%", "elapsed_time": "18h 0m 40s", "remaining_time": "1h 37m 10s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.0283} {"loss": 0.00423024, "grad_norm": 1.00164973, "learning_rate": 2e-08, "reward": 3.09667969, "reward_std": 0.30462036, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.1846875, "rewards/MazeReward/std": 0.21719041, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 119.6359375, "completions/min_length": 47.0, "completions/max_length": 300.4, "completions/clipped_ratio": 0.0, "kl": 0.10574633, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.72649573, "global_step/max_steps": "1840/2000", "percentage": "92.00%", "elapsed_time": "18h 3m 24s", "remaining_time": "1h 34m 12s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028306} {"loss": 0.00422583, "grad_norm": 0.51744544, "learning_rate": 2e-08, "reward": 3.515625, "reward_std": 0.68826342, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.2265625, "rewards/MazeReward/std": 0.27320862, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.990625, "completions/min_length": 47.6, "completions/max_length": 424.6, "completions/clipped_ratio": 0.0, "kl": 0.10564749, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.76923077, "global_step/max_steps": "1845/2000", "percentage": "92.25%", "elapsed_time": "18h 6m 21s", "remaining_time": "1h 31m 15s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028306} {"loss": 0.00401286, "grad_norm": 0.82555361, "learning_rate": 2e-08, "reward": 3.371875, "reward_std": 0.75214228, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.2121875, "rewards/MazeReward/std": 0.25288013, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.2875, "completions/min_length": 48.2, "completions/max_length": 341.8, "completions/clipped_ratio": 0.0, "kl": 0.1003047, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.81196581, "global_step/max_steps": "1850/2000", "percentage": "92.50%", "elapsed_time": "18h 9m 9s", "remaining_time": "1h 28m 18s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028309} {"loss": 0.00422137, "grad_norm": 0.81280991, "learning_rate": 1e-08, "reward": 3.259375, "reward_std": 0.31027054, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.20093751, "rewards/MazeReward/std": 0.23117016, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.8, "completions/min_length": 50.2, "completions/max_length": 343.4, "completions/clipped_ratio": 0.0, "kl": 0.10551224, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.85470085, "global_step/max_steps": "1855/2000", "percentage": "92.75%", "elapsed_time": "18h 11m 55s", "remaining_time": "1h 25m 21s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028314} {"loss": 0.00439553, "grad_norm": 1.10914205, "learning_rate": 1e-08, "reward": 4.075, "reward_std": 0.67030721, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.2825, "rewards/MazeReward/std": 0.27602745, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.9515625, "completions/min_length": 47.0, "completions/max_length": 344.8, "completions/clipped_ratio": 0.0, "kl": 0.10986832, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.8974359, "global_step/max_steps": "1860/2000", "percentage": "93.00%", "elapsed_time": "18h 14m 45s", "remaining_time": "1h 22m 24s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028317} {"loss": 0.00410199, "grad_norm": 0.8145535, "learning_rate": 1e-08, "reward": 3.7375, "reward_std": 0.52235777, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.24875, "rewards/MazeReward/std": 0.258267, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.453125, "completions/min_length": 49.8, "completions/max_length": 424.2, "completions/clipped_ratio": 0.0, "kl": 0.10255065, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.94017094, "global_step/max_steps": "1865/2000", "percentage": "93.25%", "elapsed_time": "18h 17m 41s", "remaining_time": "1h 19m 27s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028317} {"loss": 0.0044011, "grad_norm": 0.06424906, "learning_rate": 1e-08, "reward": 3.38125, "reward_std": 0.59683893, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.21312501, "rewards/MazeReward/std": 0.28499728, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.9578125, "completions/min_length": 46.6, "completions/max_length": 382.6, "completions/clipped_ratio": 0.0, "kl": 0.11002746, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.98290598, "global_step/max_steps": "1870/2000", "percentage": "93.50%", "elapsed_time": "18h 20m 31s", "remaining_time": "1h 16m 30s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02832} {"loss": 0.00409554, "grad_norm": 0.64873145, "learning_rate": 1e-08, "reward": 3.975, "reward_std": 0.84134539, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.27250001, "rewards/MazeReward/std": 0.30401532, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.4640625, "completions/min_length": 49.2, "completions/max_length": 316.6, "completions/clipped_ratio": 0.0, "kl": 0.10238274, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.02564103, "global_step/max_steps": "1875/2000", "percentage": "93.75%", "elapsed_time": "18h 23m 19s", "remaining_time": "1h 13m 33s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028323} {"loss": 0.00424391, "grad_norm": 0.49436098, "learning_rate": 1e-08, "reward": 3.334375, "reward_std": 0.40421188, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.2084375, "rewards/MazeReward/std": 0.25012314, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.6984375, "completions/min_length": 47.6, "completions/max_length": 422.6, "completions/clipped_ratio": 0.0, "kl": 0.10609785, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.06837607, "global_step/max_steps": "1880/2000", "percentage": "94.00%", "elapsed_time": "18h 26m 16s", "remaining_time": "1h 10m 36s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028323} {"loss": 0.00435693, "grad_norm": 0.50843692, "learning_rate": 1e-08, "reward": 3.59375, "reward_std": 0.63147125, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.23437499, "rewards/MazeReward/std": 0.27095603, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.1859375, "completions/min_length": 50.0, "completions/max_length": 351.0, "completions/clipped_ratio": 0.0, "kl": 0.10891414, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.11111111, "global_step/max_steps": "1885/2000", "percentage": "94.25%", "elapsed_time": "18h 29m 7s", "remaining_time": "1h 7m 39s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028326} {"loss": 0.00407889, "grad_norm": 0.66645639, "learning_rate": 1e-08, "reward": 3.4625, "reward_std": 0.47230331, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.22125001, "rewards/MazeReward/std": 0.24453461, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.0859375, "completions/min_length": 47.0, "completions/max_length": 387.4, "completions/clipped_ratio": 0.0, "kl": 0.10197193, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.15384615, "global_step/max_steps": "1890/2000", "percentage": "94.50%", "elapsed_time": "18h 32m 1s", "remaining_time": "1h 4m 43s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028327} {"loss": 0.00448702, "grad_norm": 0.05352815, "learning_rate": 1e-08, "reward": 3.478125, "reward_std": 0.34457637, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.2228125, "rewards/MazeReward/std": 0.24724717, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.35, "completions/min_length": 49.4, "completions/max_length": 334.4, "completions/clipped_ratio": 0.0, "kl": 0.11215386, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.1965812, "global_step/max_steps": "1895/2000", "percentage": "94.75%", "elapsed_time": "18h 34m 46s", "remaining_time": "1h 1m 46s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028331} {"loss": 0.00464495, "grad_norm": 0.77465706, "learning_rate": 1e-08, "reward": 3.440625, "reward_std": 0.73813897, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.2190625, "rewards/MazeReward/std": 0.28290378, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.7515625, "completions/min_length": 49.8, "completions/max_length": 321.6, "completions/clipped_ratio": 0.0, "kl": 0.11612661, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.23931624, "global_step/max_steps": "1900/2000", "percentage": "95.00%", "elapsed_time": "18h 37m 31s", "remaining_time": "58m 49s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028337} {"loss": 0.00434216, "grad_norm": 0.64639534, "learning_rate": 1e-08, "reward": 3.728125, "reward_std": 0.35591975, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.24781251, "rewards/MazeReward/std": 0.23685934, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.953125, "completions/min_length": 46.0, "completions/max_length": 318.4, "completions/clipped_ratio": 0.0, "kl": 0.10854203, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.28205128, "global_step/max_steps": "1905/2000", "percentage": "95.25%", "elapsed_time": "18h 41m 25s", "remaining_time": "55m 55s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028312} {"loss": 0.00470483, "grad_norm": 1.17934864, "learning_rate": 1e-08, "reward": 3.4875, "reward_std": 0.77448367, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.22375, "rewards/MazeReward/std": 0.27049732, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.671875, "completions/min_length": 48.6, "completions/max_length": 375.2, "completions/clipped_ratio": 0.0, "kl": 0.11762103, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.32478632, "global_step/max_steps": "1910/2000", "percentage": "95.50%", "elapsed_time": "18h 44m 18s", "remaining_time": "52m 58s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028314} {"loss": 0.00471613, "grad_norm": 0.6733017, "learning_rate": 0.0, "reward": 3.565625, "reward_std": 0.53627394, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.2315625, "rewards/MazeReward/std": 0.28864869, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.8703125, "completions/min_length": 44.2, "completions/max_length": 383.0, "completions/clipped_ratio": 0.0, "kl": 0.1178839, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.36752137, "global_step/max_steps": "1915/2000", "percentage": "95.75%", "elapsed_time": "18h 47m 9s", "remaining_time": "50m 1s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028316} {"loss": 0.0045238, "grad_norm": 1.03823708, "learning_rate": 0.0, "reward": 3.396875, "reward_std": 0.80381098, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.2146875, "rewards/MazeReward/std": 0.27168379, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.578125, "completions/min_length": 52.8, "completions/max_length": 324.8, "completions/clipped_ratio": 0.0, "kl": 0.11309505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.41025641, "global_step/max_steps": "1920/2000", "percentage": "96.00%", "elapsed_time": "18h 49m 54s", "remaining_time": "47m 4s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028321} {"loss": 0.004393, "grad_norm": 0.82571393, "learning_rate": 0.0, "reward": 3.95625, "reward_std": 0.59204166, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.270625, "rewards/MazeReward/std": 0.28637204, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.0703125, "completions/min_length": 45.2, "completions/max_length": 327.6, "completions/clipped_ratio": 0.0, "kl": 0.10980163, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.45299145, "global_step/max_steps": "1925/2000", "percentage": "96.25%", "elapsed_time": "18h 52m 38s", "remaining_time": "44m 7s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028326} {"loss": 0.00415186, "grad_norm": 1.07424693, "learning_rate": 0.0, "reward": 3.60449219, "reward_std": 0.47861121, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.235625, "rewards/MazeReward/std": 0.25146309, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767767, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 127.8203125, "completions/min_length": 46.8, "completions/max_length": 695.0, "completions/clipped_ratio": 0.0015625, "kl": 0.10377972, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.4957265, "global_step/max_steps": "1930/2000", "percentage": "96.50%", "elapsed_time": "18h 56m 7s", "remaining_time": "41m 12s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028313} {"loss": 0.00408892, "grad_norm": 0.61189488, "learning_rate": 0.0, "reward": 3.6875, "reward_std": 0.47385878, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.24375001, "rewards/MazeReward/std": 0.23270266, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.9828125, "completions/min_length": 44.8, "completions/max_length": 328.4, "completions/clipped_ratio": 0.0, "kl": 0.10219636, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.53846154, "global_step/max_steps": "1935/2000", "percentage": "96.75%", "elapsed_time": "18h 58m 53s", "remaining_time": "38m 15s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028317} {"loss": 0.00434509, "grad_norm": 1.27841144, "learning_rate": 0.0, "reward": 3.659375, "reward_std": 0.73014527, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.24093751, "rewards/MazeReward/std": 0.25682282, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.25625, "completions/min_length": 47.6, "completions/max_length": 349.4, "completions/clipped_ratio": 0.0, "kl": 0.10859407, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.58119658, "global_step/max_steps": "1940/2000", "percentage": "97.00%", "elapsed_time": "19h 1m 41s", "remaining_time": "35m 18s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028321} {"loss": 0.00422232, "grad_norm": 0.98987307, "learning_rate": 0.0, "reward": 3.5125, "reward_std": 0.44113067, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.22625, "rewards/MazeReward/std": 0.24874129, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.1, "completions/min_length": 48.8, "completions/max_length": 337.2, "completions/clipped_ratio": 0.0, "kl": 0.1055587, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.62393162, "global_step/max_steps": "1945/2000", "percentage": "97.25%", "elapsed_time": "19h 4m 28s", "remaining_time": "32m 21s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028324} {"loss": 0.00399487, "grad_norm": 1.02177389, "learning_rate": 0.0, "reward": 3.428125, "reward_std": 0.54930134, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.2178125, "rewards/MazeReward/std": 0.21325217, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.5265625, "completions/min_length": 45.8, "completions/max_length": 446.0, "completions/clipped_ratio": 0.0, "kl": 0.09988251, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.66666667, "global_step/max_steps": "1950/2000", "percentage": "97.50%", "elapsed_time": "19h 7m 29s", "remaining_time": "29m 25s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028323} {"loss": 0.00418646, "grad_norm": 0.58846183, "learning_rate": 0.0, "reward": 3.60605469, "reward_std": 0.5122002, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.23562501, "rewards/MazeReward/std": 0.26041761, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 121.23125, "completions/min_length": 47.0, "completions/max_length": 349.6, "completions/clipped_ratio": 0.0, "kl": 0.10465789, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.70940171, "global_step/max_steps": "1955/2000", "percentage": "97.75%", "elapsed_time": "19h 10m 19s", "remaining_time": "26m 28s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028325} {"loss": 0.00453209, "grad_norm": 0.60154236, "learning_rate": 0.0, "reward": 3.45, "reward_std": 0.67765831, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.22, "rewards/MazeReward/std": 0.27564533, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.5296875, "completions/min_length": 46.0, "completions/max_length": 337.6, "completions/clipped_ratio": 0.0, "kl": 0.11330186, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.75213675, "global_step/max_steps": "1960/2000", "percentage": "98.00%", "elapsed_time": "19h 13m 6s", "remaining_time": "23m 31s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028329} {"loss": 0.00469129, "grad_norm": 0.40462821, "learning_rate": 0.0, "reward": 3.6125, "reward_std": 0.24019813, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.23625002, "rewards/MazeReward/std": 0.22056612, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.4421875, "completions/min_length": 43.2, "completions/max_length": 338.2, "completions/clipped_ratio": 0.0, "kl": 0.11728158, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.79487179, "global_step/max_steps": "1965/2000", "percentage": "98.25%", "elapsed_time": "19h 15m 56s", "remaining_time": "20m 35s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028332} {"loss": 0.00415007, "grad_norm": 0.18848533, "learning_rate": 0.0, "reward": 3.2375, "reward_std": 0.39494048, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.19875, "rewards/MazeReward/std": 0.23839935, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.6640625, "completions/min_length": 48.4, "completions/max_length": 369.6, "completions/clipped_ratio": 0.0, "kl": 0.10375167, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.83760684, "global_step/max_steps": "1970/2000", "percentage": "98.50%", "elapsed_time": "19h 18m 47s", "remaining_time": "17m 38s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028334} {"loss": 0.00433357, "grad_norm": 1.00499124, "learning_rate": 0.0, "reward": 3.70625, "reward_std": 0.76382692, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.245625, "rewards/MazeReward/std": 0.2737706, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.5640625, "completions/min_length": 47.2, "completions/max_length": 347.0, "completions/clipped_ratio": 0.0, "kl": 0.10834184, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.88034188, "global_step/max_steps": "1975/2000", "percentage": "98.75%", "elapsed_time": "19h 21m 38s", "remaining_time": "14m 42s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028336} {"loss": 0.00426852, "grad_norm": 0.54479139, "learning_rate": 0.0, "reward": 3.378125, "reward_std": 0.5172571, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.2128125, "rewards/MazeReward/std": 0.24584325, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.1046875, "completions/min_length": 47.8, "completions/max_length": 335.0, "completions/clipped_ratio": 0.0, "kl": 0.10669711, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.92307692, "global_step/max_steps": "1980/2000", "percentage": "99.00%", "elapsed_time": "19h 24m 25s", "remaining_time": "11m 45s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02834} {"loss": 0.00456485, "grad_norm": 0.7059689, "learning_rate": 0.0, "reward": 3.671875, "reward_std": 0.58404051, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.24218751, "rewards/MazeReward/std": 0.28956305, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.6546875, "completions/min_length": 47.0, "completions/max_length": 344.0, "completions/clipped_ratio": 0.0, "kl": 0.11410398, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.96581197, "global_step/max_steps": "1985/2000", "percentage": "99.25%", "elapsed_time": "19h 27m 15s", "remaining_time": "8m 49s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028343} {"loss": 0.0044481, "grad_norm": 0.84878661, "learning_rate": 0.0, "reward": 3.90605469, "reward_std": 0.45252889, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.265625, "rewards/MazeReward/std": 0.26780886, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.24980469, "rewards/Format/std": 0.00220971, "completions/mean_length": 123.0890625, "completions/min_length": 50.6, "completions/max_length": 316.0, "completions/clipped_ratio": 0.0, "kl": 0.11118856, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 17.00854701, "global_step/max_steps": "1990/2000", "percentage": "99.50%", "elapsed_time": "19h 30m 1s", "remaining_time": "5m 52s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028347} {"loss": 0.00448404, "grad_norm": 0.73125996, "learning_rate": 0.0, "reward": 4.41875, "reward_std": 0.76291482, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.31687501, "rewards/MazeReward/std": 0.30017159, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.8, "completions/min_length": 48.4, "completions/max_length": 336.2, "completions/clipped_ratio": 0.0, "kl": 0.11210197, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 17.05128205, "global_step/max_steps": "1995/2000", "percentage": "99.75%", "elapsed_time": "19h 32m 50s", "remaining_time": "2m 56s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.02835} {"loss": 0.00425379, "grad_norm": 0.45607881, "learning_rate": 0.0, "reward": 3.821875, "reward_std": 0.69849305, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.2571875, "rewards/MazeReward/std": 0.28713129, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.578125, "completions/min_length": 46.0, "completions/max_length": 355.4, "completions/clipped_ratio": 0.0, "kl": 0.10633117, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 17.09401709, "global_step/max_steps": "2000/2000", "percentage": "100.00%", "elapsed_time": "19h 35m 40s", "remaining_time": "0s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028352} {"train_runtime": 70600.3404, "train_samples_per_second": 3.626, "train_steps_per_second": 0.028, "total_flos": 0.0, "train_loss": 0.00348292, "epoch": 17.09401709, "global_step/max_steps": "2000/2000", "percentage": "100.00%", "elapsed_time": "19h 36m 33s", "remaining_time": "0s", "memory(GiB)": 39.0, "train_speed(iter/s)": 0.028331} {"model_parameter_info": "Qwen2_5_VLForConditionalGeneration: 8292.1667M Params (7615.6165M Trainable [91.8411%]), 0.0019M Buffers.", "last_model_checkpoint": "/xfr_ceph_sh/liuchonghan/swiftm/project_dir/GRPO_MAZE/v17-20250929-145319/checkpoint-2000", "best_model_checkpoint": null, "best_metric": null, "global_step": 2000, "log_history": [{"loss": 5.3551048040390015e-09, "grad_norm": 2.0850979009306507, "learning_rate": 1e-08, "reward": 1.2958984375, "reward_std": 0.5146088600158691, "frac_reward_zero_std": 0.25, "rewards/MazeReward/mean": 0.01718750037252903, "rewards/MazeReward/std": 0.05627460405230522, "rewards/MazeFormat/mean": 0.8828125, "rewards/MazeFormat/std": 0.322907418012619, "rewards/Format/mean": 0.2412109375, "rewards/Format/std": 0.04487404227256775, "completions/mean_length": 147.515625, "completions/min_length": 43.0, "completions/max_length": 711.0, "completions/clipped_ratio": 0.0, "kl": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.008547008547008548, "step": 1}, {"loss": 2.261187728436198e-05, "grad_norm": 1.6039673774878993, "learning_rate": 5e-08, "reward": 1.216064453125, "reward_std": 0.4408714398741722, "frac_reward_zero_std": 0.296875, "rewards/MazeReward/mean": 0.009765625698491931, "rewards/MazeReward/std": 0.04561943328008056, "rewards/MazeFormat/mean": 0.88671875, "rewards/MazeFormat/std": 0.31641268730163574, "rewards/Format/mean": 0.231689453125, "rewards/Format/std": 0.06383909657597542, "completions/mean_length": 140.15625, "completions/min_length": 45.75, "completions/max_length": 525.0, "completions/clipped_ratio": 0.0, "kl": 0.0005648559153996757, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.042735042735042736, "step": 5}, {"loss": 3.144462825730443e-05, "grad_norm": 1.473835017850979, "learning_rate": 1e-07, "reward": 1.2603515625, "reward_std": 0.44822131991386416, "frac_reward_zero_std": 0.35, "rewards/MazeReward/mean": 0.011875000246800483, "rewards/MazeReward/std": 0.05252101495862007, "rewards/MazeFormat/mean": 0.9046875, "rewards/MazeFormat/std": 0.2910091012716293, "rewards/Format/mean": 0.2369140625, "rewards/Format/std": 0.054740263521671294, "completions/mean_length": 145.815625, "completions/min_length": 39.0, "completions/max_length": 646.6, "completions/clipped_ratio": 0.0, "kl": 0.0007853303133742884, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08547008547008547, "step": 10}, {"loss": 2.9532110784202813e-05, "grad_norm": 2.0583040142314197, "learning_rate": 1.5e-07, "reward": 1.233203125, "reward_std": 0.3958433210849762, "frac_reward_zero_std": 0.275, "rewards/MazeReward/mean": 0.010000000125728548, "rewards/MazeReward/std": 0.04012826085090637, "rewards/MazeFormat/mean": 0.9, "rewards/MazeFormat/std": 0.2974621653556824, "rewards/Format/mean": 0.233203125, "rewards/Format/std": 0.06156868264079094, "completions/mean_length": 148.54375, "completions/min_length": 42.6, "completions/max_length": 664.4, "completions/clipped_ratio": 0.0, "kl": 0.000737494510030956, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1282051282051282, "step": 15}, {"loss": 3.3766511478461325e-05, "grad_norm": 2.077471563018731, "learning_rate": 2e-07, "reward": 1.288671875, "reward_std": 0.481926828622818, "frac_reward_zero_std": 0.275, "rewards/MazeReward/mean": 0.015937499795109035, "rewards/MazeReward/std": 0.05514752417802811, "rewards/MazeFormat/mean": 0.8953125, "rewards/MazeFormat/std": 0.3048880577087402, "rewards/Format/mean": 0.233984375, "rewards/Format/std": 0.059659218043088914, "completions/mean_length": 141.3703125, "completions/min_length": 47.0, "completions/max_length": 640.8, "completions/clipped_ratio": 0.0, "kl": 0.0008433389681158588, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.17094017094017094, "step": 20}, {"loss": 6.408471963368357e-05, "grad_norm": 1.5830201192961726, "learning_rate": 2.5e-07, "reward": 1.3404296875, "reward_std": 0.375272661447525, "frac_reward_zero_std": 0.4375, "rewards/MazeReward/mean": 0.0162499999627471, "rewards/MazeReward/std": 0.0570610947906971, "rewards/MazeFormat/mean": 0.9359375, "rewards/MazeFormat/std": 0.24299971163272857, "rewards/Format/mean": 0.2419921875, "rewards/Format/std": 0.04338446594774723, "completions/mean_length": 125.58125, "completions/min_length": 39.6, "completions/max_length": 460.2, "completions/clipped_ratio": 0.0, "kl": 0.001601347164978506, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21367521367521367, "step": 25}, {"loss": 0.00016010119579732419, "grad_norm": 1.1955216398068267, "learning_rate": 3e-07, "reward": 1.33046875, "reward_std": 0.3205643713474274, "frac_reward_zero_std": 0.5125, "rewards/MazeReward/mean": 0.013437500456348062, "rewards/MazeReward/std": 0.048377957195043564, "rewards/MazeFormat/mean": 0.953125, "rewards/MazeFormat/std": 0.21053162813186646, "rewards/Format/mean": 0.24296875, "rewards/Format/std": 0.04054766036570072, "completions/mean_length": 120.7453125, "completions/min_length": 40.8, "completions/max_length": 452.8, "completions/clipped_ratio": 0.0, "kl": 0.004001263665850274, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2564102564102564, "step": 30}, {"loss": 0.00046548396348953245, "grad_norm": 1.3153789735954688, "learning_rate": 3.5e-07, "reward": 1.3740234375, "reward_std": 0.23168300092220306, "frac_reward_zero_std": 0.675, "rewards/MazeReward/mean": 0.014687499776482583, "rewards/MazeReward/std": 0.0510710246860981, "rewards/MazeFormat/mean": 0.978125, "rewards/MazeFormat/std": 0.14282614290714263, "rewards/Format/mean": 0.2490234375, "rewards/Format/std": 0.008435053564608098, "completions/mean_length": 106.0109375, "completions/min_length": 36.8, "completions/max_length": 318.6, "completions/clipped_ratio": 0.0, "kl": 0.011635770567227154, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.29914529914529914, "step": 35}, {"loss": 0.0007580962032079696, "grad_norm": 1.5192037310820845, "learning_rate": 4e-07, "reward": 1.3908203125, "reward_std": 0.24733528196811677, "frac_reward_zero_std": 0.7, "rewards/MazeReward/mean": 0.015625, "rewards/MazeReward/std": 0.05470488891005516, "rewards/MazeFormat/mean": 0.9859375, "rewards/MazeFormat/std": 0.10508071333169937, "rewards/Format/mean": 0.2486328125, "rewards/Format/std": 0.013764306530356407, "completions/mean_length": 107.7546875, "completions/min_length": 39.6, "completions/max_length": 410.4, "completions/clipped_ratio": 0.0, "kl": 0.01894672798225656, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3418803418803419, "step": 40}, {"loss": 0.0007857446558773518, "grad_norm": 0.9839755046926172, "learning_rate": 4.5e-07, "reward": 1.377734375, "reward_std": 0.1983001172542572, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.0140625, "rewards/MazeReward/std": 0.04950801432132721, "rewards/MazeFormat/mean": 0.9875, "rewards/MazeFormat/std": 0.09610848724842072, "rewards/Format/mean": 0.249609375, "rewards/Format/std": 0.004419417306780815, "completions/mean_length": 108.796875, "completions/min_length": 41.4, "completions/max_length": 309.4, "completions/clipped_ratio": 0.0, "kl": 0.0196377347339876, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.38461538461538464, "step": 45}, {"loss": 0.0006918612867593765, "grad_norm": 1.2306949614247369, "learning_rate": 5e-07, "reward": 1.358984375, "reward_std": 0.16672340631484986, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.012187499948777259, "rewards/MazeReward/std": 0.04448289349675179, "rewards/MazeFormat/mean": 0.9875, "rewards/MazeFormat/std": 0.09960551857948304, "rewards/Format/mean": 0.249609375, "rewards/Format/std": 0.004419417306780815, "completions/mean_length": 105.29375, "completions/min_length": 40.8, "completions/max_length": 333.0, "completions/clipped_ratio": 0.0, "kl": 0.01729733906686306, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.42735042735042733, "step": 50}, {"loss": 0.0006235324777662754, "grad_norm": 1.0980013379578395, "learning_rate": 5.5e-07, "reward": 1.3109375, "reward_std": 0.12922156751155853, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.006875000079162419, "rewards/MazeReward/std": 0.03500961922109127, "rewards/MazeFormat/mean": 0.9921875, "rewards/MazeFormat/std": 0.06573191285133362, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 104.796875, "completions/min_length": 38.6, "completions/max_length": 287.0, "completions/clipped_ratio": 0.0, "kl": 0.015583514084573835, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4700854700854701, "step": 55}, {"loss": 0.00063182576559484, "grad_norm": 1.3129140370641044, "learning_rate": 6e-07, "reward": 1.3873046875, "reward_std": 0.2064604544546455, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.014062500186264515, "rewards/MazeReward/std": 0.047212396562099454, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535533845424652, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 99.0171875, "completions/min_length": 41.0, "completions/max_length": 263.8, "completions/clipped_ratio": 0.0, "kl": 0.015794009924866258, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5128205128205128, "step": 60}, {"loss": 0.0005105869844555855, "grad_norm": 1.0616087417649411, "learning_rate": 6.5e-07, "reward": 1.3916015625, "reward_std": 0.19745952961966395, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.014375000912696122, "rewards/MazeReward/std": 0.04834332019090652, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2494140625, "rewards/Format/std": 0.0066291259601712225, "completions/mean_length": 102.0234375, "completions/min_length": 42.4, "completions/max_length": 372.4, "completions/clipped_ratio": 0.0, "kl": 0.012760270561557263, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5555555555555556, "step": 65}, {"loss": 0.0003107914002612233, "grad_norm": 1.3544242249856053, "learning_rate": 7e-07, "reward": 1.4435546875, "reward_std": 0.31596590876579284, "frac_reward_zero_std": 0.675, "rewards/MazeReward/mean": 0.020312500186264515, "rewards/MazeReward/std": 0.06453245431184769, "rewards/MazeFormat/mean": 0.990625, "rewards/MazeFormat/std": 0.08515809774398804, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 115.3265625, "completions/min_length": 44.0, "completions/max_length": 363.6, "completions/clipped_ratio": 0.0, "kl": 0.007766712602460757, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5982905982905983, "step": 70}, {"loss": 0.000283644744195044, "grad_norm": 1.3602426896267505, "learning_rate": 7.5e-07, "reward": 1.4341796875, "reward_std": 0.23621928095817565, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.01875000037252903, "rewards/MazeReward/std": 0.061052392423152926, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.02490137964487076, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 115.6609375, "completions/min_length": 46.4, "completions/max_length": 376.6, "completions/clipped_ratio": 0.0, "kl": 0.0070880687271710485, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.6410256410256411, "step": 75}, {"loss": 0.00035074278712272645, "grad_norm": 1.3509089586341816, "learning_rate": 8e-07, "reward": 1.4609375, "reward_std": 0.30132956802845, "frac_reward_zero_std": 0.7, "rewards/MazeReward/mean": 0.021562500018626453, "rewards/MazeReward/std": 0.0627759762108326, "rewards/MazeFormat/mean": 0.9953125, "rewards/MazeFormat/std": 0.04257904887199402, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 115.4859375, "completions/min_length": 43.0, "completions/max_length": 351.0, "completions/clipped_ratio": 0.0, "kl": 0.00876753773773089, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.6837606837606838, "step": 80}, {"loss": 0.0004655797965824604, "grad_norm": 1.1044949891253064, "learning_rate": 8.499999999999999e-07, "reward": 1.3734375, "reward_std": 0.18065465837717057, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.013125000149011612, "rewards/MazeReward/std": 0.04797708801925182, "rewards/MazeFormat/mean": 0.9921875, "rewards/MazeFormat/std": 0.07793438732624054, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.1203125, "completions/min_length": 45.0, "completions/max_length": 436.4, "completions/clipped_ratio": 0.0, "kl": 0.01163666148786433, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.7264957264957265, "step": 85}, {"loss": 0.0009792439639568328, "grad_norm": 0.996598772694048, "learning_rate": 9e-07, "reward": 1.471875, "reward_std": 0.23379422426223756, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.02250000014901161, "rewards/MazeReward/std": 0.06349937170743943, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535533845424652, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 88.609375, "completions/min_length": 40.6, "completions/max_length": 337.0, "completions/clipped_ratio": 0.0, "kl": 0.02447981040459126, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.7692307692307693, "step": 90}, {"loss": 0.0014949593693017959, "grad_norm": 1.505195182911895, "learning_rate": 9.499999999999999e-07, "reward": 1.5171875, "reward_std": 0.20643335282802583, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.026875000912696123, "rewards/MazeReward/std": 0.06502393409609794, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 79.990625, "completions/min_length": 40.4, "completions/max_length": 321.2, "completions/clipped_ratio": 0.0, "kl": 0.03736886349506676, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.811965811965812, "step": 95}, {"loss": 0.0016774306073784827, "grad_norm": 1.1733766193739803, "learning_rate": 1e-06, "reward": 1.5046875, "reward_std": 0.182009756565094, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.025625000521540642, "rewards/MazeReward/std": 0.06656526178121566, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 80.1984375, "completions/min_length": 41.2, "completions/max_length": 283.4, "completions/clipped_ratio": 0.0, "kl": 0.041934849717654286, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.8547008547008547, "step": 100}, {"loss": 0.0013951731845736504, "grad_norm": 1.4119164170618441, "learning_rate": 9.999829128320873e-07, "reward": 1.4875, "reward_std": 0.22661330699920654, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.023750000447034837, "rewards/MazeReward/std": 0.06286502480506898, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 91.3640625, "completions/min_length": 41.2, "completions/max_length": 310.8, "completions/clipped_ratio": 0.0, "kl": 0.03487839815206826, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.8974358974358975, "step": 105}, {"loss": 0.0014822594821453094, "grad_norm": 1.2686400994592306, "learning_rate": 9.999316524962345e-07, "reward": 1.55625, "reward_std": 0.21021372228860855, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.030937500670552252, "rewards/MazeReward/std": 0.07310352176427841, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535533845424652, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 103.746875, "completions/min_length": 38.6, "completions/max_length": 355.4, "completions/clipped_ratio": 0.0, "kl": 0.037058884068392216, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.9401709401709402, "step": 110}, {"loss": 0.0015832275152206422, "grad_norm": 1.0874137167924887, "learning_rate": 9.998462224960173e-07, "reward": 1.590625, "reward_std": 0.25334451496601107, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.034062500670552255, "rewards/MazeReward/std": 0.07615574449300766, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 103.7015625, "completions/min_length": 42.6, "completions/max_length": 252.2, "completions/clipped_ratio": 0.0, "kl": 0.03958274000324309, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.9829059829059829, "step": 115}, {"loss": 0.0018614999949932098, "grad_norm": 0.7102428626419721, "learning_rate": 9.99726628670463e-07, "reward": 1.5640625, "reward_std": 0.2902904152870178, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.031562500074505805, "rewards/MazeReward/std": 0.07654295042157173, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 104.0, "completions/min_length": 42.2, "completions/max_length": 333.8, "completions/clipped_ratio": 0.0, "kl": 0.046541464724577965, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.0256410256410255, "step": 120}, {"loss": 0.002314428612589836, "grad_norm": 0.7699986902071484, "learning_rate": 9.995728791936505e-07, "reward": 1.6576171875, "reward_std": 0.1521604984998703, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.04125000163912773, "rewards/MazeReward/std": 0.07993590980768203, "rewards/MazeFormat/mean": 0.9953125, "rewards/MazeFormat/std": 0.04257904887199402, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 103.3359375, "completions/min_length": 44.2, "completions/max_length": 358.6, "completions/clipped_ratio": 0.0, "kl": 0.05786212412640453, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.0683760683760684, "step": 125}, {"loss": 0.00292002372443676, "grad_norm": 1.1483079529637177, "learning_rate": 9.993849845741523e-07, "reward": 1.675, "reward_std": 0.12972374111413956, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.042500000447034836, "rewards/MazeReward/std": 0.0803371638059616, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 84.5421875, "completions/min_length": 41.4, "completions/max_length": 333.6, "completions/clipped_ratio": 0.0, "kl": 0.07299737120047212, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.1111111111111112, "step": 130}, {"loss": 0.002901558205485344, "grad_norm": 0.22525600827587947, "learning_rate": 9.991629576543163e-07, "reward": 1.475, "reward_std": 0.06849094033241272, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.022500000335276125, "rewards/MazeReward/std": 0.0530305951833725, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 80.1828125, "completions/min_length": 41.2, "completions/max_length": 236.8, "completions/clipped_ratio": 0.0, "kl": 0.07253276985138654, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.1538461538461537, "step": 135}, {"loss": 0.0023476168513298034, "grad_norm": 0.7531123673947644, "learning_rate": 9.989068136093872e-07, "reward": 1.6044921875, "reward_std": 0.1165056936442852, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.035625001043081285, "rewards/MazeReward/std": 0.07613980323076248, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 92.290625, "completions/min_length": 40.2, "completions/max_length": 293.4, "completions/clipped_ratio": 0.0, "kl": 0.058682880457490684, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.1965811965811965, "step": 140}, {"loss": 0.0018699193373322488, "grad_norm": 0.9655821885141387, "learning_rate": 9.986165699464705e-07, "reward": 1.678125, "reward_std": 0.18339579403400422, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.04281250089406967, "rewards/MazeReward/std": 0.08515360653400421, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 107.709375, "completions/min_length": 40.4, "completions/max_length": 365.8, "completions/clipped_ratio": 0.0, "kl": 0.04674590518698096, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.2393162393162394, "step": 145}, {"loss": 0.0019011721014976501, "grad_norm": 0.8442355822998869, "learning_rate": 9.982922465033348e-07, "reward": 1.5875, "reward_std": 0.11563374549150467, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.03375000078231096, "rewards/MazeReward/std": 0.07314120382070541, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 106.034375, "completions/min_length": 39.8, "completions/max_length": 373.0, "completions/clipped_ratio": 0.0, "kl": 0.047533696377649905, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.282051282051282, "step": 150}, {"loss": 0.0023824993520975115, "grad_norm": 0.20366802328438316, "learning_rate": 9.979338654470567e-07, "reward": 1.7623046875, "reward_std": 0.05337072163820267, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.051249999180436136, "rewards/MazeReward/std": 0.08585172146558762, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 91.5609375, "completions/min_length": 41.4, "completions/max_length": 340.6, "completions/clipped_ratio": 0.0, "kl": 0.059565877495333555, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.3247863247863247, "step": 155}, {"loss": 0.0023328181356191634, "grad_norm": 0.8988364942792788, "learning_rate": 9.975414512725056e-07, "reward": 1.753125, "reward_std": 0.04218914955854416, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.050312502309679985, "rewards/MazeReward/std": 0.08477405905723571, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 93.615625, "completions/min_length": 40.2, "completions/max_length": 329.8, "completions/clipped_ratio": 0.0, "kl": 0.05831799576990306, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.3675213675213675, "step": 160}, {"loss": 0.001972428523004055, "grad_norm": 0.6869513437818591, "learning_rate": 9.971150308006687e-07, "reward": 1.6859375, "reward_std": 0.14878431037068368, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.04375000130385161, "rewards/MazeReward/std": 0.08216542676091194, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 96.5296875, "completions/min_length": 41.6, "completions/max_length": 275.0, "completions/clipped_ratio": 0.0, "kl": 0.04930293974466622, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.4102564102564101, "step": 165}, {"loss": 0.0020832683891057967, "grad_norm": 0.2002667446495595, "learning_rate": 9.966546331768192e-07, "reward": 1.575, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "rewards/MazeReward/mean": 0.03250000104308128, "rewards/MazeReward/std": 0.07358299195766449, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 96.5640625, "completions/min_length": 39.8, "completions/max_length": 289.4, "completions/clipped_ratio": 0.0, "kl": 0.05207843626849353, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.452991452991453, "step": 170}, {"loss": 0.0019463833421468734, "grad_norm": 0.48721820593956633, "learning_rate": 9.961602898685223e-07, "reward": 1.74375, "reward_std": 0.01767766922712326, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.04937500096857548, "rewards/MazeReward/std": 0.08451755046844482, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 105.240625, "completions/min_length": 39.0, "completions/max_length": 306.4, "completions/clipped_ratio": 0.0, "kl": 0.04866266236640513, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.4957264957264957, "step": 175}, {"loss": 0.0017639096826314927, "grad_norm": 0.46120689224078665, "learning_rate": 9.956320346634875e-07, "reward": 1.7826171875, "reward_std": 0.0430610993411392, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.053437501564621924, "rewards/MazeReward/std": 0.08518625646829606, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 116.1109375, "completions/min_length": 42.0, "completions/max_length": 333.8, "completions/clipped_ratio": 0.0, "kl": 0.04409475696738809, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.5384615384615383, "step": 180}, {"loss": 0.0018238790333271026, "grad_norm": 0.5148105099027588, "learning_rate": 9.95069903667256e-07, "reward": 1.596875, "reward_std": 0.031984337419271466, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.03468750007450581, "rewards/MazeReward/std": 0.07464145123958588, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.6703125, "completions/min_length": 44.0, "completions/max_length": 417.6, "completions/clipped_ratio": 0.0, "kl": 0.045597366779111324, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.5811965811965814, "step": 185}, {"loss": 0.0016768455505371093, "grad_norm": 0.5138975947309226, "learning_rate": 9.944739353007341e-07, "reward": 1.7375, "reward_std": 0.03535533845424652, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.048750000447034834, "rewards/MazeReward/std": 0.0851988285779953, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.4046875, "completions/min_length": 42.6, "completions/max_length": 423.0, "completions/clipped_ratio": 0.0, "kl": 0.04191551350522786, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.623931623931624, "step": 190}, {"loss": 0.0015711262822151184, "grad_norm": 0.9067737103191192, "learning_rate": 9.938441702975689e-07, "reward": 1.634375, "reward_std": 0.03808925524353981, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.038437500968575476, "rewards/MazeReward/std": 0.07839376032352448, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.6421875, "completions/min_length": 41.0, "completions/max_length": 352.6, "completions/clipped_ratio": 0.0, "kl": 0.03926404060330242, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.6666666666666665, "step": 195}, {"loss": 0.0016614319756627083, "grad_norm": 0.09424510883121427, "learning_rate": 9.931806517013612e-07, "reward": 1.840625, "reward_std": 0.02041158601641655, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.059062501043081285, "rewards/MazeReward/std": 0.09025485664606095, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.8484375, "completions/min_length": 40.2, "completions/max_length": 296.2, "completions/clipped_ratio": 0.0, "kl": 0.041527598001994195, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.7094017094017095, "step": 200}, {"loss": 0.0017192240804433823, "grad_norm": 0.677800532639946, "learning_rate": 9.924834248627258e-07, "reward": 1.896875, "reward_std": 0.00883883461356163, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.06468750163912773, "rewards/MazeReward/std": 0.09119289517402648, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.19375, "completions/min_length": 41.6, "completions/max_length": 344.0, "completions/clipped_ratio": 0.0, "kl": 0.04297648051287979, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.7521367521367521, "step": 205}, {"loss": 0.0014772934839129448, "grad_norm": 0.09625960474724543, "learning_rate": 9.917525374361911e-07, "reward": 1.5576171875, "reward_std": 0.05305119827389717, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.0309375012293458, "rewards/MazeReward/std": 0.059112554788589476, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 121.43125, "completions/min_length": 41.4, "completions/max_length": 394.2, "completions/clipped_ratio": 0.0, "kl": 0.036925971927121284, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.7948717948717947, "step": 210}, {"loss": 0.0015943828970193864, "grad_norm": 0.8859900164071041, "learning_rate": 9.909880393769418e-07, "reward": 1.715625, "reward_std": 0.02651650384068489, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.04656250104308128, "rewards/MazeReward/std": 0.08200520724058151, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 115.7890625, "completions/min_length": 40.2, "completions/max_length": 342.8, "completions/clipped_ratio": 0.0, "kl": 0.03985867821611464, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.8376068376068377, "step": 215}, {"loss": 0.0014145723544061184, "grad_norm": 0.3847694911433562, "learning_rate": 9.901899829374047e-07, "reward": 1.5826171875, "reward_std": 0.04306109994649887, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.03343750163912773, "rewards/MazeReward/std": 0.06289278417825699, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 126.36875, "completions/min_length": 42.0, "completions/max_length": 354.8, "completions/clipped_ratio": 0.0, "kl": 0.03536163377575576, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.8803418803418803, "step": 220}, {"loss": 0.0013176266103982926, "grad_norm": 0.10814792355364673, "learning_rate": 9.893584226636772e-07, "reward": 1.684375, "reward_std": 0.022201896458864213, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.04343750029802322, "rewards/MazeReward/std": 0.08095956891775132, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.515625, "completions/min_length": 46.0, "completions/max_length": 369.6, "completions/clipped_ratio": 0.0, "kl": 0.03294098875485361, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.9230769230769231, "step": 225}, {"loss": 0.001161702163517475, "grad_norm": 0.0733353932479021, "learning_rate": 9.884934153917996e-07, "reward": 1.571875, "reward_std": 0.00883883461356163, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.03218750059604645, "rewards/MazeReward/std": 0.07254017442464829, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.4390625, "completions/min_length": 47.0, "completions/max_length": 401.4, "completions/clipped_ratio": 0.0, "kl": 0.029036648059263825, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 1.965811965811966, "step": 230}, {"loss": 0.0012318025343120097, "grad_norm": 0.09827642734369745, "learning_rate": 9.8759502024387e-07, "reward": 1.759375, "reward_std": 0.03061639815568924, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.05093750227242708, "rewards/MazeReward/std": 0.08228155076503754, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.7703125, "completions/min_length": 45.8, "completions/max_length": 362.6, "completions/clipped_ratio": 0.0, "kl": 0.030789492116309703, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.0085470085470085, "step": 235}, {"loss": 0.0014049587771296501, "grad_norm": 0.12057015042239135, "learning_rate": 9.866632986240029e-07, "reward": 1.7625, "reward_std": 0.023145502805709837, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.05125000160187483, "rewards/MazeReward/std": 0.07912175357341766, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.1171875, "completions/min_length": 46.0, "completions/max_length": 308.4, "completions/clipped_ratio": 0.0, "kl": 0.03512264594901353, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.051282051282051, "step": 240}, {"loss": 0.001234784722328186, "grad_norm": 0.0906374129160372, "learning_rate": 9.856983142141337e-07, "reward": 1.66875, "reward_std": 0.03104073107242584, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.041875001043081284, "rewards/MazeReward/std": 0.07904350236058236, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.7921875, "completions/min_length": 44.0, "completions/max_length": 424.8, "completions/clipped_ratio": 0.0, "kl": 0.030867059994488955, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.094017094017094, "step": 245}, {"loss": 0.0012160670943558217, "grad_norm": 0.09514702459075781, "learning_rate": 9.847001329696652e-07, "reward": 1.61875, "reward_std": 0.04671337753534317, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.036875000596046446, "rewards/MazeReward/std": 0.07626682817935944, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.0875, "completions/min_length": 45.4, "completions/max_length": 397.0, "completions/clipped_ratio": 0.0, "kl": 0.03039407222531736, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.1367521367521367, "step": 250}, {"loss": 0.0013320941478013992, "grad_norm": 0.0658722574421842, "learning_rate": 9.836688231149592e-07, "reward": 1.890625, "reward_std": 0.02041158601641655, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.06406250149011612, "rewards/MazeReward/std": 0.09125813692808152, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.2296875, "completions/min_length": 40.0, "completions/max_length": 342.6, "completions/clipped_ratio": 0.0, "kl": 0.03329704308416694, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.1794871794871793, "step": 255}, {"loss": 0.0012724403291940688, "grad_norm": 1.136778575046199, "learning_rate": 9.826044551386742e-07, "reward": 1.8025390625, "reward_std": 0.051319288462400435, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.05562500096857548, "rewards/MazeReward/std": 0.08506897389888764, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.02490137964487076, "rewards/Format/mean": 0.2494140625, "rewards/Format/std": 0.003797071799635887, "completions/mean_length": 135.5328125, "completions/min_length": 45.2, "completions/max_length": 359.0, "completions/clipped_ratio": 0.0, "kl": 0.031809128040913494, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.2222222222222223, "step": 260}, {"loss": 0.0012527533806860446, "grad_norm": 0.09204891341686909, "learning_rate": 9.81507101788948e-07, "reward": 1.5125, "reward_std": 0.03535533845424652, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.026249999925494195, "rewards/MazeReward/std": 0.057460378110408786, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.78125, "completions/min_length": 42.6, "completions/max_length": 349.4, "completions/clipped_ratio": 0.0, "kl": 0.03131271551828831, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.264957264957265, "step": 265}, {"loss": 0.00133742094039917, "grad_norm": 0.10430657024771325, "learning_rate": 9.803768380684242e-07, "reward": 1.646875, "reward_std": 0.00883883461356163, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.039687500521540645, "rewards/MazeReward/std": 0.07787278592586518, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 115.8, "completions/min_length": 41.6, "completions/max_length": 355.8, "completions/clipped_ratio": 0.0, "kl": 0.033433203084859996, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.3076923076923075, "step": 270}, {"loss": 0.001098698191344738, "grad_norm": 0.6038584494068286, "learning_rate": 9.792137412291263e-07, "reward": 1.5853515625, "reward_std": 0.01943976073525846, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.03375000134110451, "rewards/MazeReward/std": 0.06722360253334045, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2494140625, "rewards/Format/std": 0.0066291259601712225, "completions/mean_length": 119.921875, "completions/min_length": 41.2, "completions/max_length": 353.6, "completions/clipped_ratio": 0.0, "kl": 0.027464703540317714, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.3504273504273505, "step": 275}, {"loss": 0.001182807795703411, "grad_norm": 0.11135024211508726, "learning_rate": 9.780178907671788e-07, "reward": 1.528125, "reward_std": 0.03061639815568924, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.027812500298023225, "rewards/MazeReward/std": 0.06877715289592742, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 109.83125, "completions/min_length": 39.0, "completions/max_length": 338.0, "completions/clipped_ratio": 0.0, "kl": 0.029570043087005615, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.393162393162393, "step": 280}, {"loss": 0.001240898482501507, "grad_norm": 0.08211805142370257, "learning_rate": 9.76789368417372e-07, "reward": 1.66875, "reward_std": 0.011572751402854919, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.041875001043081284, "rewards/MazeReward/std": 0.08127498030662536, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 105.146875, "completions/min_length": 44.0, "completions/max_length": 337.2, "completions/clipped_ratio": 0.0, "kl": 0.03102337378077209, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.435897435897436, "step": 285}, {"loss": 0.0011553137563169002, "grad_norm": 0.08713728279299042, "learning_rate": 9.755282581475767e-07, "reward": 1.5734375, "reward_std": 0.004419417306780815, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.032500000670552254, "rewards/MazeReward/std": 0.07290461361408233, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.95, "completions/min_length": 44.0, "completions/max_length": 347.2, "completions/clipped_ratio": 0.0, "kl": 0.02888093756046146, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.4786324786324787, "step": 290}, {"loss": 0.001207088492810726, "grad_norm": 0.6225014244879482, "learning_rate": 9.742346461530047e-07, "reward": 1.7078125, "reward_std": 0.048613591492176055, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.045937500335276125, "rewards/MazeReward/std": 0.07529200837016106, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.6546875, "completions/min_length": 43.0, "completions/max_length": 364.6, "completions/clipped_ratio": 0.0, "kl": 0.030179329228121787, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.5213675213675213, "step": 295}, {"loss": 0.0012148864567279815, "grad_norm": 0.10214096308893882, "learning_rate": 9.729086208503173e-07, "reward": 1.646875, "reward_std": 0.00883883461356163, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.039687500521540645, "rewards/MazeReward/std": 0.07787278592586518, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 108.06875, "completions/min_length": 44.0, "completions/max_length": 337.2, "completions/clipped_ratio": 0.0, "kl": 0.030370077083352952, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.564102564102564, "step": 300}, {"loss": 0.0010770590975880622, "grad_norm": 0.07743790574000314, "learning_rate": 9.715502728715825e-07, "reward": 1.6828125, "reward_std": 0.03503581583499908, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.04343750029802322, "rewards/MazeReward/std": 0.08254078775644302, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 111.646875, "completions/min_length": 42.2, "completions/max_length": 334.6, "completions/clipped_ratio": 0.0, "kl": 0.026924411568325014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.606837606837607, "step": 305}, {"loss": 0.0012557756155729294, "grad_norm": 0.5994910349443677, "learning_rate": 9.701596950580807e-07, "reward": 1.8076171875, "reward_std": 0.0355882428586483, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.05593749955296516, "rewards/MazeReward/std": 0.08937290459871292, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 109.6625, "completions/min_length": 41.0, "completions/max_length": 319.2, "completions/clipped_ratio": 0.0, "kl": 0.031393040483817455, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.6495726495726495, "step": 310}, {"loss": 0.0012647857889533042, "grad_norm": 0.5198429194092566, "learning_rate": 9.687369824539576e-07, "reward": 1.659375, "reward_std": 0.03808925524353981, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.040937500074505806, "rewards/MazeReward/std": 0.07862303704023361, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 103.4078125, "completions/min_length": 44.0, "completions/max_length": 359.2, "completions/clipped_ratio": 0.0, "kl": 0.031615292513743044, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.6923076923076925, "step": 315}, {"loss": 0.0014513864181935788, "grad_norm": 0.0888439306908069, "learning_rate": 9.672822322997304e-07, "reward": 1.925, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "rewards/MazeReward/mean": 0.0675000011920929, "rewards/MazeReward/std": 0.09291007369756699, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 98.4875, "completions/min_length": 40.2, "completions/max_length": 353.0, "completions/clipped_ratio": 0.0, "kl": 0.03628600856754929, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.735042735042735, "step": 320}, {"loss": 0.001220554392784834, "grad_norm": 0.1063652763694058, "learning_rate": 9.657955440256395e-07, "reward": 1.675, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "rewards/MazeReward/mean": 0.04250000156462193, "rewards/MazeReward/std": 0.07974326312541961, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.2421875, "completions/min_length": 42.2, "completions/max_length": 327.2, "completions/clipped_ratio": 0.0, "kl": 0.030510167125612497, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.7777777777777777, "step": 325}, {"loss": 0.0011792988516390324, "grad_norm": 0.06943368706988594, "learning_rate": 9.642770192448535e-07, "reward": 1.70625, "reward_std": 0.06123279705643654, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.04562500063329935, "rewards/MazeReward/std": 0.07920214980840683, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 141.16875, "completions/min_length": 47.0, "completions/max_length": 381.4, "completions/clipped_ratio": 0.0, "kl": 0.029479713435284792, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.8205128205128203, "step": 330}, {"loss": 0.0012946173548698426, "grad_norm": 0.07015473179679628, "learning_rate": 9.627267617465243e-07, "reward": 1.671875, "reward_std": 0.00883883461356163, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.04218750037252903, "rewards/MazeReward/std": 0.08093532919883728, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.9328125, "completions/min_length": 43.2, "completions/max_length": 373.4, "completions/clipped_ratio": 0.0, "kl": 0.03236345420591533, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.8632478632478633, "step": 335}, {"loss": 0.00122256800532341, "grad_norm": 0.0776687533387394, "learning_rate": 9.611448774886923e-07, "reward": 1.64375, "reward_std": 0.01767766922712326, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.03937500044703483, "rewards/MazeReward/std": 0.07964256107807159, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.1890625, "completions/min_length": 41.2, "completions/max_length": 346.4, "completions/clipped_ratio": 0.0, "kl": 0.03055897105950862, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.905982905982906, "step": 340}, {"loss": 0.0013021795079112054, "grad_norm": 0.5003867330049324, "learning_rate": 9.595314745910455e-07, "reward": 1.7, "reward_std": 0.03535533845424652, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.045000001601874826, "rewards/MazeReward/std": 0.07816829383373261, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 115.2296875, "completions/min_length": 43.8, "completions/max_length": 338.6, "completions/clipped_ratio": 0.0, "kl": 0.03254980493802577, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.948717948717949, "step": 345}, {"loss": 0.0011580833233892918, "grad_norm": 0.06887739934107479, "learning_rate": 9.578866633275286e-07, "reward": 1.7109375, "reward_std": 0.03366983756422996, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.04625000152736902, "rewards/MazeReward/std": 0.0803494080901146, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.540625, "completions/min_length": 42.8, "completions/max_length": 371.6, "completions/clipped_ratio": 0.0, "kl": 0.028944591525942087, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.9914529914529915, "step": 350}, {"loss": 0.001152261160314083, "grad_norm": 0.41790464887707834, "learning_rate": 9.562105561188068e-07, "reward": 1.5138671875, "reward_std": 0.04716099426150322, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.02656250037252903, "rewards/MazeReward/std": 0.06563240215182305, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 135.721875, "completions/min_length": 46.0, "completions/max_length": 388.0, "completions/clipped_ratio": 0.0, "kl": 0.028804128093179317, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.034188034188034, "step": 355}, {"loss": 0.001003839448094368, "grad_norm": 0.6825711537810063, "learning_rate": 9.545032675245813e-07, "reward": 1.6169921875, "reward_std": 0.02264951393008232, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.036875000782310964, "rewards/MazeReward/std": 0.07546763122081757, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 120.253125, "completions/min_length": 45.0, "completions/max_length": 381.8, "completions/clipped_ratio": 0.0, "kl": 0.025093021499924362, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.076923076923077, "step": 360}, {"loss": 0.0011375134810805321, "grad_norm": 0.07988617640833492, "learning_rate": 9.527649142357594e-07, "reward": 1.725, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "rewards/MazeReward/mean": 0.047500000521540645, "rewards/MazeReward/std": 0.08356983661651611, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 112.9390625, "completions/min_length": 42.0, "completions/max_length": 298.6, "completions/clipped_ratio": 0.0, "kl": 0.028438647370785476, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.1196581196581197, "step": 365}, {"loss": 0.0012312138453125954, "grad_norm": 0.6277492109642122, "learning_rate": 9.509956150664795e-07, "reward": 1.68125, "reward_std": 0.05303300693631172, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.04312500096857548, "rewards/MazeReward/std": 0.08240270167589188, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.5203125, "completions/min_length": 45.0, "completions/max_length": 336.2, "completions/clipped_ratio": 0.0, "kl": 0.030780851130839438, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.1623931623931623, "step": 370}, {"loss": 0.0009977094829082488, "grad_norm": 0.5032696947783828, "learning_rate": 9.491954909459894e-07, "reward": 1.8482421875, "reward_std": 0.15122394859790803, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.060000001639127734, "rewards/MazeReward/std": 0.09219729751348496, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 119.6640625, "completions/min_length": 45.0, "completions/max_length": 680.4, "completions/clipped_ratio": 0.0015625, "kl": 0.02494008478242904, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.2051282051282053, "step": 375}, {"loss": 0.0010777967981994152, "grad_norm": 0.14253494037926903, "learning_rate": 9.473646649103817e-07, "reward": 1.675, "reward_std": 0.05713290050625801, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.042499999329447744, "rewards/MazeReward/std": 0.07458726465702056, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 106.1796875, "completions/min_length": 42.6, "completions/max_length": 303.8, "completions/clipped_ratio": 0.0, "kl": 0.02694298147689551, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.247863247863248, "step": 380}, {"loss": 0.0012950697913765908, "grad_norm": 0.3666414752240412, "learning_rate": 9.455032620941839e-07, "reward": 1.673046875, "reward_std": 0.03996608294546604, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.042500000819563864, "rewards/MazeReward/std": 0.081744384765625, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.249609375, "rewards/Format/std": 0.004419417306780815, "completions/mean_length": 98.975, "completions/min_length": 42.2, "completions/max_length": 270.4, "completions/clipped_ratio": 0.0, "kl": 0.03238045631442219, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.2905982905982905, "step": 385}, {"loss": 0.0013192273676395417, "grad_norm": 0.11241272737221532, "learning_rate": 9.436114097218058e-07, "reward": 1.771875, "reward_std": 0.05576692372560501, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.05218750089406967, "rewards/MazeReward/std": 0.08882526755332946, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 99.20625, "completions/min_length": 42.4, "completions/max_length": 304.0, "completions/clipped_ratio": 0.0, "kl": 0.03298132244963199, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.3333333333333335, "step": 390}, {"loss": 0.0012138230726122857, "grad_norm": 0.4786503603918001, "learning_rate": 9.416892370988442e-07, "reward": 1.6140625, "reward_std": 0.030935921147465704, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.03656250089406967, "rewards/MazeReward/std": 0.07630382627248763, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 109.65, "completions/min_length": 42.0, "completions/max_length": 320.2, "completions/clipped_ratio": 0.0, "kl": 0.030337114841677247, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.376068376068376, "step": 395}, {"loss": 0.0011456131003797054, "grad_norm": 0.4470622397798485, "learning_rate": 9.397368756032444e-07, "reward": 1.621875, "reward_std": 0.09722717925906181, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.037187499552965166, "rewards/MazeReward/std": 0.08185895383358002, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.1140625, "completions/min_length": 42.8, "completions/max_length": 322.0, "completions/clipped_ratio": 0.0, "kl": 0.02863241416634992, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.4188034188034186, "step": 400}, {"loss": 0.0012037239968776703, "grad_norm": 0.4108630055803702, "learning_rate": 9.377544586763214e-07, "reward": 1.746875, "reward_std": 0.06733967810869217, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.049687501043081284, "rewards/MazeReward/std": 0.08626897037029266, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.16875, "completions/min_length": 46.8, "completions/max_length": 346.6, "completions/clipped_ratio": 0.0, "kl": 0.03009375943802297, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.4615384615384617, "step": 405}, {"loss": 0.0012385781854391098, "grad_norm": 0.48309240343203796, "learning_rate": 9.357421218136386e-07, "reward": 1.659375, "reward_std": 0.067339675873518, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.04093750081956386, "rewards/MazeReward/std": 0.08117270022630692, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.2375, "completions/min_length": 41.0, "completions/max_length": 379.6, "completions/clipped_ratio": 0.0, "kl": 0.03096096939407289, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.5042735042735043, "step": 410}, {"loss": 0.0013860519975423813, "grad_norm": 0.07792718733846904, "learning_rate": 9.337000025557476e-07, "reward": 1.80625, "reward_std": 0.05303300693631172, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.055624999850988385, "rewards/MazeReward/std": 0.08941979110240936, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.11875, "completions/min_length": 42.6, "completions/max_length": 374.6, "completions/clipped_ratio": 0.0, "kl": 0.03464255495928228, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.547008547008547, "step": 415}, {"loss": 0.0012850278988480567, "grad_norm": 0.0701209753535078, "learning_rate": 9.316282404787869e-07, "reward": 1.721875, "reward_std": 0.00883883461356163, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.04718750026077032, "rewards/MazeReward/std": 0.07888221591711045, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.7765625, "completions/min_length": 44.2, "completions/max_length": 395.6, "completions/clipped_ratio": 0.0, "kl": 0.03212045237887651, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.58974358974359, "step": 420}, {"loss": 0.001200066413730383, "grad_norm": 0.06630152308360243, "learning_rate": 9.295269771849425e-07, "reward": 1.55, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "rewards/MazeReward/mean": 0.030000000447034835, "rewards/MazeReward/std": 0.06700892895460128, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 138.6421875, "completions/min_length": 42.2, "completions/max_length": 388.8, "completions/clipped_ratio": 0.0, "kl": 0.03000037738820538, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.6324786324786325, "step": 425}, {"loss": 0.0010827738791704179, "grad_norm": 0.6490659454594846, "learning_rate": 9.273963562927694e-07, "reward": 1.71875, "reward_std": 0.04692808985710144, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.04687500018626452, "rewards/MazeReward/std": 0.08021234273910523, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.49375, "completions/min_length": 41.2, "completions/max_length": 438.4, "completions/clipped_ratio": 0.0, "kl": 0.027061588026117533, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.6752136752136755, "step": 430}, {"loss": 0.0009752914309501648, "grad_norm": 0.06958012670951508, "learning_rate": 9.252365234273753e-07, "reward": 1.653125, "reward_std": 0.04419417306780815, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.040312501788139346, "rewards/MazeReward/std": 0.07667017579078675, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.9046875, "completions/min_length": 46.2, "completions/max_length": 402.6, "completions/clipped_ratio": 0.0, "kl": 0.024378333985805512, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.717948717948718, "step": 435}, {"loss": 0.001106039434671402, "grad_norm": 0.4209501303182819, "learning_rate": 9.230476262104676e-07, "reward": 1.778125, "reward_std": 0.09722718000411987, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.052812499925494194, "rewards/MazeReward/std": 0.09043156951665879, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.5953125, "completions/min_length": 40.6, "completions/max_length": 398.4, "completions/clipped_ratio": 0.0, "kl": 0.027644472965039312, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.7606837606837606, "step": 440}, {"loss": 0.0013174701482057572, "grad_norm": 0.6137811629084503, "learning_rate": 9.208298142502635e-07, "reward": 1.853125, "reward_std": 0.1190047413110733, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.060312502831220624, "rewards/MazeReward/std": 0.09812327623367309, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.809375, "completions/min_length": 40.6, "completions/max_length": 364.2, "completions/clipped_ratio": 0.0, "kl": 0.032928169379010795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.8034188034188032, "step": 445}, {"loss": 0.0012127190828323364, "grad_norm": 0.11178824029006519, "learning_rate": 9.185832391312642e-07, "reward": 1.6234375, "reward_std": 0.03977475576102733, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.03781250044703484, "rewards/MazeReward/std": 0.07775698155164719, "rewards/MazeFormat/mean": 0.9953125, "rewards/MazeFormat/std": 0.04257904887199402, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.4984375, "completions/min_length": 39.2, "completions/max_length": 1026.6, "completions/clipped_ratio": 0.003125, "kl": 0.030316670378670096, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.8461538461538463, "step": 450}, {"loss": 0.0012919959612190724, "grad_norm": 0.11069044326468976, "learning_rate": 9.163080544038952e-07, "reward": 1.68125, "reward_std": 0.01767766922712326, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.043125000037252906, "rewards/MazeReward/std": 0.07953909039497375, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 105.140625, "completions/min_length": 38.6, "completions/max_length": 341.8, "completions/clipped_ratio": 0.0, "kl": 0.032291453005746006, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.888888888888889, "step": 455}, {"loss": 0.0010891311801970005, "grad_norm": 0.07418346654460156, "learning_rate": 9.1400441557401e-07, "reward": 1.684375, "reward_std": 0.06187183856964111, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.04343750141561031, "rewards/MazeReward/std": 0.08347698003053665, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.80625, "completions/min_length": 41.6, "completions/max_length": 426.8, "completions/clipped_ratio": 0.0, "kl": 0.027222537877969445, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.931623931623932, "step": 460}, {"loss": 0.001152960304170847, "grad_norm": 0.4513961725261483, "learning_rate": 9.116724800922629e-07, "reward": 1.671875, "reward_std": 0.06733967512845992, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.0421875, "rewards/MazeReward/std": 0.0824856549501419, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.1578125, "completions/min_length": 42.2, "completions/max_length": 387.4, "completions/clipped_ratio": 0.0, "kl": 0.02881625925656408, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 3.9743589743589745, "step": 465}, {"loss": 0.0011756937950849534, "grad_norm": 0.07745948442852763, "learning_rate": 9.093124073433462e-07, "reward": 1.75625, "reward_std": 0.01767766922712326, "frac_reward_zero_std": 0.9875, "rewards/MazeReward/mean": 0.05062500163912773, "rewards/MazeReward/std": 0.08348544836044311, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.49375, "completions/min_length": 42.2, "completions/max_length": 383.6, "completions/clipped_ratio": 0.0, "kl": 0.029386252618860454, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.017094017094017, "step": 470}, {"loss": 0.0009222443215548993, "grad_norm": 0.49465611720279273, "learning_rate": 9.069243586350975e-07, "reward": 1.659375, "reward_std": 0.04419417306780815, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.040937500447034834, "rewards/MazeReward/std": 0.08035238832235336, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.2203125, "completions/min_length": 45.6, "completions/max_length": 411.8, "completions/clipped_ratio": 0.0, "kl": 0.023052448022644965, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.05982905982906, "step": 475}, {"loss": 0.0009421935304999351, "grad_norm": 0.3804415181399902, "learning_rate": 9.045084971874737e-07, "reward": 1.7873046875, "reward_std": 0.03590776561759412, "frac_reward_zero_std": 0.9625, "rewards/MazeReward/mean": 0.05375000089406967, "rewards/MazeReward/std": 0.08982028812170029, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 139.25625, "completions/min_length": 46.4, "completions/max_length": 397.4, "completions/clipped_ratio": 0.0, "kl": 0.023549946118146182, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.102564102564102, "step": 480}, {"loss": 0.0009268797934055328, "grad_norm": 0.057119814736412144, "learning_rate": 9.020649881213958e-07, "reward": 1.7421875, "reward_std": 0.07312507033348084, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.049375002086162564, "rewards/MazeReward/std": 0.0843595564365387, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.678125, "completions/min_length": 42.8, "completions/max_length": 399.2, "completions/clipped_ratio": 0.0, "kl": 0.023166414664592593, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.145299145299146, "step": 485}, {"loss": 0.0011070730164647103, "grad_norm": 0.06585259973199104, "learning_rate": 8.995939984474623e-07, "reward": 1.765625, "reward_std": 0.044194172322750094, "frac_reward_zero_std": 0.975, "rewards/MazeReward/mean": 0.0515625, "rewards/MazeReward/std": 0.0891284242272377, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.6171875, "completions/min_length": 43.2, "completions/max_length": 398.0, "completions/clipped_ratio": 0.0, "kl": 0.027671133645344525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.188034188034188, "step": 490}, {"loss": 0.0010565707460045814, "grad_norm": 0.07195953683323365, "learning_rate": 8.970956970545355e-07, "reward": 1.653125, "reward_std": 0.07954951077699661, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.04031250141561031, "rewards/MazeReward/std": 0.08194140195846558, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.1890625, "completions/min_length": 47.0, "completions/max_length": 413.6, "completions/clipped_ratio": 0.0, "kl": 0.026406785706058145, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.230769230769231, "step": 495}, {"loss": 0.0013033507391810417, "grad_norm": 0.7781681320743019, "learning_rate": 8.945702546981968e-07, "reward": 1.8734375, "reward_std": 0.12816310077905654, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.06250000223517418, "rewards/MazeReward/std": 0.09654748886823654, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 145.796875, "completions/min_length": 46.4, "completions/max_length": 362.6, "completions/clipped_ratio": 0.0, "kl": 0.03257816187106073, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.273504273504273, "step": 500}, {"loss": 0.0015162624418735503, "grad_norm": 0.3578918222768929, "learning_rate": 8.920178439890764e-07, "reward": 1.68125, "reward_std": 0.15235702246427535, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.043125000596046445, "rewards/MazeReward/std": 0.08651673793792725, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 137.4609375, "completions/min_length": 46.6, "completions/max_length": 359.8, "completions/clipped_ratio": 0.0, "kl": 0.03791391234844923, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.316239316239316, "step": 505}, {"loss": 0.002045181207358837, "grad_norm": 0.09537837140559144, "learning_rate": 8.894386393810562e-07, "reward": 1.75, "reward_std": 0.08984613418579102, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.05000000149011612, "rewards/MazeReward/std": 0.09007189571857452, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.7265625, "completions/min_length": 46.4, "completions/max_length": 450.0, "completions/clipped_ratio": 0.0, "kl": 0.05111516213510185, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.358974358974359, "step": 510}, {"loss": 0.0016084747388958932, "grad_norm": 0.4824559224968059, "learning_rate": 8.868328171593446e-07, "reward": 1.671875, "reward_std": 0.12037268280982971, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.04218750037252903, "rewards/MazeReward/std": 0.08702817857265473, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.690625, "completions/min_length": 46.0, "completions/max_length": 432.6, "completions/clipped_ratio": 0.0, "kl": 0.04020680082030594, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.401709401709402, "step": 515}, {"loss": 0.0012827066704630852, "grad_norm": 0.8577094695092758, "learning_rate": 8.842005554284295e-07, "reward": 1.6953125, "reward_std": 0.22211109101772308, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.044687502458691596, "rewards/MazeReward/std": 0.09304451793432236, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 146.3796875, "completions/min_length": 45.8, "completions/max_length": 430.6, "completions/clipped_ratio": 0.0, "kl": 0.032066563097760084, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.444444444444445, "step": 520}, {"loss": 0.0013256728649139403, "grad_norm": 0.10337241127092116, "learning_rate": 8.815420340999033e-07, "reward": 1.803125, "reward_std": 0.13762091994285583, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.05531249977648258, "rewards/MazeReward/std": 0.09847460389137268, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 153.4734375, "completions/min_length": 52.4, "completions/max_length": 485.2, "completions/clipped_ratio": 0.0, "kl": 0.03314277136232704, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.487179487179487, "step": 525}, {"loss": 0.0015093881636857986, "grad_norm": 0.7822120163109842, "learning_rate": 8.788574348801674e-07, "reward": 1.834375, "reward_std": 0.189805269241333, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.05843750089406967, "rewards/MazeReward/std": 0.0991519644856453, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 158.69375, "completions/min_length": 48.2, "completions/max_length": 485.6, "completions/clipped_ratio": 0.0, "kl": 0.03773373905569315, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.52991452991453, "step": 530}, {"loss": 0.0017939582467079163, "grad_norm": 0.7805650338633506, "learning_rate": 8.761469412580124e-07, "reward": 1.821875, "reward_std": 0.19949472695589066, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.05718750134110451, "rewards/MazeReward/std": 0.10765503495931625, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 148.371875, "completions/min_length": 44.8, "completions/max_length": 423.0, "completions/clipped_ratio": 0.0, "kl": 0.044839829625561836, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.572649572649572, "step": 535}, {"loss": 0.002171722613275051, "grad_norm": 0.4920259806333268, "learning_rate": 8.734107384920769e-07, "reward": 1.728125, "reward_std": 0.20894072502851485, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.04781249910593033, "rewards/MazeReward/std": 0.09794875681400299, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 141.103125, "completions/min_length": 48.0, "completions/max_length": 418.2, "completions/clipped_ratio": 0.0, "kl": 0.0542911626631394, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.615384615384615, "step": 540}, {"loss": 0.0018487025052309036, "grad_norm": 0.09997289593206347, "learning_rate": 8.706490135981855e-07, "reward": 1.859375, "reward_std": 0.1668713480234146, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.06093750074505806, "rewards/MazeReward/std": 0.10001743435859681, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.7015625, "completions/min_length": 46.0, "completions/max_length": 312.6, "completions/clipped_ratio": 0.0, "kl": 0.04621433573774993, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.6581196581196584, "step": 545}, {"loss": 0.0018147587776184081, "grad_norm": 1.1045318983420604, "learning_rate": 8.678619553365658e-07, "reward": 1.76875, "reward_std": 0.2367353230714798, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.051874999329447745, "rewards/MazeReward/std": 0.0953328013420105, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.375, "completions/min_length": 47.4, "completions/max_length": 329.2, "completions/clipped_ratio": 0.0, "kl": 0.0453646298032254, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.700854700854701, "step": 550}, {"loss": 0.002685732953250408, "grad_norm": 0.7177873849402897, "learning_rate": 8.650497541989481e-07, "reward": 1.796875, "reward_std": 0.2886998623609543, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.05468749850988388, "rewards/MazeReward/std": 0.1036534622311592, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.9046875, "completions/min_length": 45.4, "completions/max_length": 365.2, "completions/clipped_ratio": 0.0, "kl": 0.06714675026014447, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.743589743589744, "step": 555}, {"loss": 0.0031074721366167067, "grad_norm": 1.4582789306751023, "learning_rate": 8.622126023955445e-07, "reward": 2.04375, "reward_std": 0.49395993947982786, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.07937500178813935, "rewards/MazeReward/std": 0.13558758199214935, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.7328125, "completions/min_length": 45.4, "completions/max_length": 385.0, "completions/clipped_ratio": 0.0, "kl": 0.07768260380253196, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.786324786324786, "step": 560}, {"loss": 0.0037682272493839266, "grad_norm": 0.5188677978439834, "learning_rate": 8.593506938419119e-07, "reward": 1.940625, "reward_std": 0.25565096735954285, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.0690625011920929, "rewards/MazeReward/std": 0.11698136031627655, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.7515625, "completions/min_length": 45.2, "completions/max_length": 427.8, "completions/clipped_ratio": 0.0, "kl": 0.09420239464379847, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.829059829059829, "step": 565}, {"loss": 0.0038392230868339538, "grad_norm": 0.9088687585485157, "learning_rate": 8.564642241456986e-07, "reward": 2.059375, "reward_std": 0.38991106748580934, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.08093750327825547, "rewards/MazeReward/std": 0.13103083670139312, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.4171875, "completions/min_length": 45.4, "completions/max_length": 324.2, "completions/clipped_ratio": 0.0, "kl": 0.09596180045045913, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.871794871794872, "step": 570}, {"loss": 0.004007264971733093, "grad_norm": 0.9208085900960126, "learning_rate": 8.535533905932737e-07, "reward": 1.965625, "reward_std": 0.2675593763589859, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.0715625025331974, "rewards/MazeReward/std": 0.12371677309274673, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.2703125, "completions/min_length": 47.2, "completions/max_length": 315.8, "completions/clipped_ratio": 0.0, "kl": 0.10018278225325047, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.914529914529915, "step": 575}, {"loss": 0.0040367752313613895, "grad_norm": 0.6640474967383764, "learning_rate": 8.506183921362442e-07, "reward": 2.1015625, "reward_std": 0.29669988751411436, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.08531250059604645, "rewards/MazeReward/std": 0.12427427470684052, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 115.0890625, "completions/min_length": 44.0, "completions/max_length": 336.8, "completions/clipped_ratio": 0.0, "kl": 0.10094724758528173, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.957264957264957, "step": 580}, {"loss": 0.0037270143628120424, "grad_norm": 1.1663694130460183, "learning_rate": 8.47659429377856e-07, "reward": 2.028125, "reward_std": 0.45681936144828794, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.07781250178813934, "rewards/MazeReward/std": 0.13334924578666688, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.65625, "completions/min_length": 51.8, "completions/max_length": 414.4, "completions/clipped_ratio": 0.0, "kl": 0.09316067076288163, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.0, "step": 585}, {"loss": 0.004178965836763382, "grad_norm": 0.6440667033421631, "learning_rate": 8.446767045592829e-07, "reward": 2.1078125, "reward_std": 0.29008276015520096, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.08593750149011611, "rewards/MazeReward/std": 0.1316636636853218, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.6875, "completions/min_length": 49.6, "completions/max_length": 760.4, "completions/clipped_ratio": 0.0015625, "kl": 0.10447121229954064, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.042735042735043, "step": 590}, {"loss": 0.004393426328897476, "grad_norm": 0.9390368640983007, "learning_rate": 8.416704215458042e-07, "reward": 2.228125, "reward_std": 0.35290807485580444, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.0978125050663948, "rewards/MazeReward/std": 0.14086824804544448, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.784375, "completions/min_length": 48.0, "completions/max_length": 363.2, "completions/clipped_ratio": 0.0, "kl": 0.10983694661408663, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.085470085470085, "step": 595}, {"loss": 0.004272110760211945, "grad_norm": 0.1506841707882417, "learning_rate": 8.386407858128706e-07, "reward": 1.928125, "reward_std": 0.20876103043556213, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.0678125001490116, "rewards/MazeReward/std": 0.112618388235569, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.24375, "completions/min_length": 46.4, "completions/max_length": 436.0, "completions/clipped_ratio": 0.0, "kl": 0.10680326581932605, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.128205128205128, "step": 600}, {"loss": 0.004046444967389107, "grad_norm": 1.0740204866550926, "learning_rate": 8.355880044320597e-07, "reward": 1.890625, "reward_std": 0.38107420802116393, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.06406250335276127, "rewards/MazeReward/std": 0.1236990287899971, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.8765625, "completions/min_length": 49.6, "completions/max_length": 324.8, "completions/clipped_ratio": 0.0, "kl": 0.10116232139989734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.170940170940171, "step": 605}, {"loss": 0.003936619684100151, "grad_norm": 1.375038482551864, "learning_rate": 8.325122860569241e-07, "reward": 1.9625, "reward_std": 0.4536560237407684, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.07125000096857548, "rewards/MazeReward/std": 0.1281718507409096, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.2015625, "completions/min_length": 47.6, "completions/max_length": 382.6, "completions/clipped_ratio": 0.0, "kl": 0.09841636866331101, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.213675213675214, "step": 610}, {"loss": 0.003652118146419525, "grad_norm": 1.1723644368618256, "learning_rate": 8.294138409087289e-07, "reward": 2.1125, "reward_std": 0.3000807613134384, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.08625000119209289, "rewards/MazeReward/std": 0.12415469884872436, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.0296875, "completions/min_length": 46.4, "completions/max_length": 367.4, "completions/clipped_ratio": 0.0, "kl": 0.09129949091002346, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.256410256410256, "step": 615}, {"loss": 0.00377846360206604, "grad_norm": 0.9497437788096562, "learning_rate": 8.262928807620843e-07, "reward": 1.978125, "reward_std": 0.2670121371746063, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.07281250134110451, "rewards/MazeReward/std": 0.12639591097831726, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.265625, "completions/min_length": 48.0, "completions/max_length": 360.8, "completions/clipped_ratio": 0.0, "kl": 0.09446963081136346, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.299145299145299, "step": 620}, {"loss": 0.0030866391956806184, "grad_norm": 0.5190445499016438, "learning_rate": 8.231496189304704e-07, "reward": 2.36875, "reward_std": 0.16285933554172516, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.11187500059604645, "rewards/MazeReward/std": 0.1347514197230339, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 135.5703125, "completions/min_length": 48.6, "completions/max_length": 362.8, "completions/clipped_ratio": 0.0, "kl": 0.07716012820601463, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.3418803418803416, "step": 625}, {"loss": 0.0031170587986707687, "grad_norm": 0.09794822319206357, "learning_rate": 8.199842702516582e-07, "reward": 1.925, "reward_std": 0.13193328380584718, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.0675000011920929, "rewards/MazeReward/std": 0.11543771624565125, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 147.7828125, "completions/min_length": 50.4, "completions/max_length": 334.4, "completions/clipped_ratio": 0.0, "kl": 0.07791153551079333, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.384615384615385, "step": 630}, {"loss": 0.003522975742816925, "grad_norm": 1.1149140092457694, "learning_rate": 8.167970510730252e-07, "reward": 2.3625, "reward_std": 0.36211256980895995, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.11124999970197677, "rewards/MazeReward/std": 0.14898888319730758, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.090625, "completions/min_length": 50.0, "completions/max_length": 445.4, "completions/clipped_ratio": 0.0, "kl": 0.08806668547913432, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.427350427350428, "step": 635}, {"loss": 0.004394949972629547, "grad_norm": 0.6659839381335685, "learning_rate": 8.135881792367685e-07, "reward": 2.228125, "reward_std": 0.26746952831745147, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.09781250357627869, "rewards/MazeReward/std": 0.144122476875782, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.378125, "completions/min_length": 49.2, "completions/max_length": 373.2, "completions/clipped_ratio": 0.0, "kl": 0.1098570752888918, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.47008547008547, "step": 640}, {"loss": 0.0043334424495697025, "grad_norm": 0.9565892063556133, "learning_rate": 8.103578740650156e-07, "reward": 2.096875, "reward_std": 0.3764511190354824, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.08468750044703484, "rewards/MazeReward/std": 0.13841111958026886, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.1984375, "completions/min_length": 48.8, "completions/max_length": 334.0, "completions/clipped_ratio": 0.0, "kl": 0.10833388594910502, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.512820512820513, "step": 645}, {"loss": 0.004735597595572472, "grad_norm": 0.8226224361357182, "learning_rate": 8.071063563448339e-07, "reward": 1.909375, "reward_std": 0.3718319460749626, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.06593750081956387, "rewards/MazeReward/std": 0.13234255015850066, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.625, "completions/min_length": 49.4, "completions/max_length": 319.4, "completions/clipped_ratio": 0.0, "kl": 0.11840139674022794, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.555555555555555, "step": 650}, {"loss": 0.0049566149711608885, "grad_norm": 0.5636374604837406, "learning_rate": 8.038338483131406e-07, "reward": 2.240625, "reward_std": 0.4401042401790619, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.09906250238418579, "rewards/MazeReward/std": 0.1511495217680931, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 107.1859375, "completions/min_length": 43.4, "completions/max_length": 323.4, "completions/clipped_ratio": 0.0, "kl": 0.12389737367630005, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.598290598290598, "step": 655}, {"loss": 0.004563612118363381, "grad_norm": 0.10616510206581765, "learning_rate": 8.005405736415125e-07, "reward": 2.18125, "reward_std": 0.4140210926532745, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.0931250013411045, "rewards/MazeReward/std": 0.14360718429088593, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.5671875, "completions/min_length": 48.4, "completions/max_length": 273.4, "completions/clipped_ratio": 0.0, "kl": 0.1140824118629098, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.641025641025641, "step": 660}, {"loss": 0.004295756667852401, "grad_norm": 1.080309819167414, "learning_rate": 7.97226757420899e-07, "reward": 2.375, "reward_std": 0.3431886717677116, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.11250000596046447, "rewards/MazeReward/std": 0.14831583201885223, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 112.2515625, "completions/min_length": 43.4, "completions/max_length": 307.4, "completions/clipped_ratio": 0.0, "kl": 0.10736344018951058, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.683760683760684, "step": 665}, {"loss": 0.003763087838888168, "grad_norm": 1.0543109341141317, "learning_rate": 7.938926261462365e-07, "reward": 2.4296875, "reward_std": 0.4040140748023987, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.11812500208616257, "rewards/MazeReward/std": 0.15680089294910432, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.2609375, "completions/min_length": 54.0, "completions/max_length": 307.0, "completions/clipped_ratio": 0.0, "kl": 0.09406610750593244, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.726495726495727, "step": 670}, {"loss": 0.003623136132955551, "grad_norm": 1.0450825996397608, "learning_rate": 7.905384077009692e-07, "reward": 2.3216796875, "reward_std": 0.4457092106342316, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.10718750059604645, "rewards/MazeReward/std": 0.15488833487033843, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 136.3890625, "completions/min_length": 54.8, "completions/max_length": 348.0, "completions/clipped_ratio": 0.0, "kl": 0.09056174824945629, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.769230769230769, "step": 675}, {"loss": 0.0036685168743133545, "grad_norm": 0.988620318778312, "learning_rate": 7.871643313414718e-07, "reward": 2.475, "reward_std": 0.5890827000141143, "frac_reward_zero_std": 0.6875, "rewards/MazeReward/mean": 0.12281250208616257, "rewards/MazeReward/std": 0.16487362384796142, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535533845424652, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.19375, "completions/min_length": 53.2, "completions/max_length": 408.2, "completions/clipped_ratio": 0.0, "kl": 0.09170094770379364, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.811965811965812, "step": 680}, {"loss": 0.004626954346895218, "grad_norm": 0.9104290040899861, "learning_rate": 7.837706276813818e-07, "reward": 2.3546875, "reward_std": 0.6117664694786071, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.11062500178813935, "rewards/MazeReward/std": 0.16645533144474028, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.44375, "completions/min_length": 47.8, "completions/max_length": 739.8, "completions/clipped_ratio": 0.0015625, "kl": 0.11567545076832175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.854700854700854, "step": 685}, {"loss": 0.004906488955020905, "grad_norm": 0.7226788498026553, "learning_rate": 7.803575286758363e-07, "reward": 2.1091796875, "reward_std": 0.35595683455467225, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.08625000044703483, "rewards/MazeReward/std": 0.14030689746141434, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535533845424652, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 112.5953125, "completions/min_length": 46.4, "completions/max_length": 679.4, "completions/clipped_ratio": 0.0015625, "kl": 0.12265814091078937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.897435897435898, "step": 690}, {"loss": 0.004843691736459732, "grad_norm": 0.8459505519586844, "learning_rate": 7.769252676056186e-07, "reward": 2.2890625, "reward_std": 0.5048299908638001, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.10406250357627869, "rewards/MazeReward/std": 0.1560496523976326, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.75, "completions/min_length": 48.6, "completions/max_length": 640.8, "completions/clipped_ratio": 0.0015625, "kl": 0.12108418410643935, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.94017094017094, "step": 695}, {"loss": 0.005365237221121788, "grad_norm": 1.124400597193716, "learning_rate": 7.734740790612136e-07, "reward": 2.0466796875, "reward_std": 0.47798594236373904, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.08031250461935997, "rewards/MazeReward/std": 0.14358068108558655, "rewards/MazeFormat/mean": 0.99375, "rewards/MazeFormat/std": 0.07071067690849304, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 126.9140625, "completions/min_length": 50.8, "completions/max_length": 1695.6, "completions/clipped_ratio": 0.00625, "kl": 0.13407598659396172, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 5.982905982905983, "step": 700}, {"loss": 0.005399256944656372, "grad_norm": 1.8347446426315819, "learning_rate": 7.700041989267736e-07, "reward": 2.359375, "reward_std": 0.6089624762535095, "frac_reward_zero_std": 0.6875, "rewards/MazeReward/mean": 0.11093750447034836, "rewards/MazeReward/std": 0.1633380174636841, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.090625, "completions/min_length": 43.6, "completions/max_length": 320.6, "completions/clipped_ratio": 0.0, "kl": 0.1349698563106358, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.0256410256410255, "step": 705}, {"loss": 0.00543157123029232, "grad_norm": 0.7119869882019543, "learning_rate": 7.665158643639969e-07, "reward": 2.35, "reward_std": 0.17233721613883973, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.11000000089406967, "rewards/MazeReward/std": 0.14970978498458862, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 106.2421875, "completions/min_length": 46.0, "completions/max_length": 288.6, "completions/clipped_ratio": 0.0, "kl": 0.13576708221808076, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.068376068376068, "step": 710}, {"loss": 0.005049209296703339, "grad_norm": 1.3542356621009914, "learning_rate": 7.63009313795917e-07, "reward": 2.453125, "reward_std": 0.3907487615942955, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.12031250596046447, "rewards/MazeReward/std": 0.1551662117242813, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.7984375, "completions/min_length": 47.2, "completions/max_length": 324.0, "completions/clipped_ratio": 0.0, "kl": 0.1262268964201212, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.111111111111111, "step": 715}, {"loss": 0.004493502527475357, "grad_norm": 1.0513048424099745, "learning_rate": 7.594847868906076e-07, "reward": 2.2828125, "reward_std": 0.45589269399642945, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.10375000387430192, "rewards/MazeReward/std": 0.1544831484556198, "rewards/MazeFormat/mean": 0.9953125, "rewards/MazeFormat/std": 0.04257904887199402, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.815625, "completions/min_length": 46.6, "completions/max_length": 1031.0, "completions/clipped_ratio": 0.003125, "kl": 0.1123365402687341, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.153846153846154, "step": 720}, {"loss": 0.004856729134917259, "grad_norm": 1.4851889198441417, "learning_rate": 7.559425245448005e-07, "reward": 2.484375, "reward_std": 0.6057861864566803, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.12343750149011612, "rewards/MazeReward/std": 0.17167751789093016, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 112.59375, "completions/min_length": 43.6, "completions/max_length": 316.4, "completions/clipped_ratio": 0.0, "kl": 0.12141171535477042, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.196581196581197, "step": 725}, {"loss": 0.0048632964491844176, "grad_norm": 0.8669441021866194, "learning_rate": 7.523827688674219e-07, "reward": 2.409375, "reward_std": 0.45396568775177004, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.1159375011920929, "rewards/MazeReward/std": 0.16991185545921325, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 107.7328125, "completions/min_length": 44.4, "completions/max_length": 272.8, "completions/clipped_ratio": 0.0, "kl": 0.1215637393295765, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.239316239316239, "step": 730}, {"loss": 0.0050648033618927, "grad_norm": 1.4722002374740226, "learning_rate": 7.488057631630437e-07, "reward": 2.525, "reward_std": 0.38043124973773956, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.1275000035762787, "rewards/MazeReward/std": 0.16860854327678682, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.7125, "completions/min_length": 42.0, "completions/max_length": 329.4, "completions/clipped_ratio": 0.0, "kl": 0.1266021172516048, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.282051282051282, "step": 735}, {"loss": 0.004797622561454773, "grad_norm": 0.55055024572662, "learning_rate": 7.452117519152541e-07, "reward": 2.06875, "reward_std": 0.20705547034740449, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.08187500089406967, "rewards/MazeReward/std": 0.13211893141269684, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 107.0140625, "completions/min_length": 44.2, "completions/max_length": 289.8, "completions/clipped_ratio": 0.0, "kl": 0.11993481060490012, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.3247863247863245, "step": 740}, {"loss": 0.005077499523758888, "grad_norm": 0.5312250136982063, "learning_rate": 7.416009807699481e-07, "reward": 2.1375, "reward_std": 0.28113911747932435, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.08875000178813934, "rewards/MazeReward/std": 0.14652519375085832, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 105.2875, "completions/min_length": 44.0, "completions/max_length": 288.6, "completions/clipped_ratio": 0.0, "kl": 0.1269465253688395, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.367521367521368, "step": 745}, {"loss": 0.004760226607322693, "grad_norm": 0.9992359610718061, "learning_rate": 7.379736965185368e-07, "reward": 2.08125, "reward_std": 0.27081257551908494, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.0831249974668026, "rewards/MazeReward/std": 0.13845863491296767, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 108.621875, "completions/min_length": 39.0, "completions/max_length": 344.2, "completions/clipped_ratio": 0.0, "kl": 0.11899666213430464, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.410256410256411, "step": 750}, {"loss": 0.004681786894798279, "grad_norm": 1.1801694424009033, "learning_rate": 7.343301470810807e-07, "reward": 2.040625, "reward_std": 0.2878511890769005, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.07906250059604644, "rewards/MazeReward/std": 0.13535543382167817, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.45625, "completions/min_length": 44.2, "completions/max_length": 345.6, "completions/clipped_ratio": 0.0, "kl": 0.11703309016302228, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.452991452991453, "step": 755}, {"loss": 0.004368682950735092, "grad_norm": 1.0955414704601263, "learning_rate": 7.306705814893439e-07, "reward": 2.4, "reward_std": 0.4841845452785492, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.11500000506639481, "rewards/MazeReward/std": 0.16087428629398345, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.95625, "completions/min_length": 44.2, "completions/max_length": 342.0, "completions/clipped_ratio": 0.0, "kl": 0.1092055644840002, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.495726495726496, "step": 760}, {"loss": 0.004682149365544319, "grad_norm": 0.5510477790957099, "learning_rate": 7.269952498697734e-07, "reward": 2.315625, "reward_std": 0.30975441038608553, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.10656250566244126, "rewards/MazeReward/std": 0.15523205399513246, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.575, "completions/min_length": 39.2, "completions/max_length": 357.4, "completions/clipped_ratio": 0.0, "kl": 0.11705516274087131, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.538461538461538, "step": 765}, {"loss": 0.003985384851694107, "grad_norm": 1.2102669313683418, "learning_rate": 7.233044034264033e-07, "reward": 2.29375, "reward_std": 0.31227283328771593, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.10437500327825547, "rewards/MazeReward/std": 0.15053761154413223, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.95625, "completions/min_length": 46.2, "completions/max_length": 383.8, "completions/clipped_ratio": 0.0, "kl": 0.0996331512928009, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.581196581196581, "step": 770}, {"loss": 0.0038939833641052244, "grad_norm": 0.7817060796702399, "learning_rate": 7.195982944236852e-07, "reward": 2.3921875, "reward_std": 0.529007887840271, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.1143750011920929, "rewards/MazeReward/std": 0.16184994280338288, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 139.1375, "completions/min_length": 44.4, "completions/max_length": 706.8, "completions/clipped_ratio": 0.0015625, "kl": 0.0973482757806778, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.6239316239316235, "step": 775}, {"loss": 0.003926222771406173, "grad_norm": 0.09525023329994642, "learning_rate": 7.158771761692464e-07, "reward": 2.28125, "reward_std": 0.3239602565765381, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.10312499850988388, "rewards/MazeReward/std": 0.14976677149534226, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.196875, "completions/min_length": 43.2, "completions/max_length": 370.6, "completions/clipped_ratio": 0.0, "kl": 0.09814362931065261, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.666666666666667, "step": 780}, {"loss": 0.00413544774055481, "grad_norm": 0.8633826012698371, "learning_rate": 7.121413029965769e-07, "reward": 2.353125, "reward_std": 0.2962625741958618, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.11031250134110451, "rewards/MazeReward/std": 0.14826812595129013, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.6578125, "completions/min_length": 48.4, "completions/max_length": 360.0, "completions/clipped_ratio": 0.0, "kl": 0.10337876132689416, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.7094017094017095, "step": 785}, {"loss": 0.004654894769191742, "grad_norm": 0.8994361258648217, "learning_rate": 7.083909302476452e-07, "reward": 2.284375, "reward_std": 0.15024999976158143, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.10343750342726707, "rewards/MazeReward/std": 0.14751739650964737, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.65625, "completions/min_length": 47.2, "completions/max_length": 382.8, "completions/clipped_ratio": 0.0, "kl": 0.11636109538376331, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.752136752136752, "step": 790}, {"loss": 0.004891832917928695, "grad_norm": 1.3106559165017158, "learning_rate": 7.04626314255447e-07, "reward": 2.21875, "reward_std": 0.4313662528991699, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.09687500149011612, "rewards/MazeReward/std": 0.16106954216957092, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.7375, "completions/min_length": 44.4, "completions/max_length": 348.4, "completions/clipped_ratio": 0.0, "kl": 0.12230570400133729, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.794871794871795, "step": 795}, {"loss": 0.004900344088673592, "grad_norm": 1.5334334590930974, "learning_rate": 7.008477123264847e-07, "reward": 2.59375, "reward_std": 0.5986619353294372, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.13437500596046448, "rewards/MazeReward/std": 0.17435938417911528, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.8796875, "completions/min_length": 44.0, "completions/max_length": 291.6, "completions/clipped_ratio": 0.0, "kl": 0.1225132972933352, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.837606837606837, "step": 800}, {"loss": 0.004991311207413673, "grad_norm": 0.8435043549951452, "learning_rate": 6.970553827231808e-07, "reward": 2.4609375, "reward_std": 0.32227787673473357, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.12125000357627869, "rewards/MazeReward/std": 0.1671494722366333, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.9640625, "completions/min_length": 44.0, "completions/max_length": 297.0, "completions/clipped_ratio": 0.0, "kl": 0.12477834653109313, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.880341880341881, "step": 805}, {"loss": 0.004670744389295578, "grad_norm": 1.0045786602311646, "learning_rate": 6.932495846462261e-07, "reward": 2.36875, "reward_std": 0.1768634021282196, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.1118750050663948, "rewards/MazeReward/std": 0.15320810079574584, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 109.0140625, "completions/min_length": 47.4, "completions/max_length": 320.2, "completions/clipped_ratio": 0.0, "kl": 0.1167522537522018, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.923076923076923, "step": 810}, {"loss": 0.00478287898004055, "grad_norm": 1.28987641874247, "learning_rate": 6.894305782168638e-07, "reward": 2.346875, "reward_std": 0.5353424847126007, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.10968749970197678, "rewards/MazeReward/std": 0.1676226884126663, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.41875, "completions/min_length": 46.2, "completions/max_length": 355.6, "completions/clipped_ratio": 0.0, "kl": 0.11954717123880983, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.965811965811966, "step": 815}, {"loss": 0.004670187830924988, "grad_norm": 0.46566542739911215, "learning_rate": 6.855986244591103e-07, "reward": 2.2888671875, "reward_std": 0.41036257445812224, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.1040624961256981, "rewards/MazeReward/std": 0.1657874584197998, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 116.9609375, "completions/min_length": 43.8, "completions/max_length": 320.8, "completions/clipped_ratio": 0.0, "kl": 0.11674826825037599, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.0085470085470085, "step": 820}, {"loss": 0.00523756854236126, "grad_norm": 0.914032778978338, "learning_rate": 6.817539852819148e-07, "reward": 2.3, "reward_std": 0.24220315217971802, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.10500000417232513, "rewards/MazeReward/std": 0.15769463479518891, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 104.7578125, "completions/min_length": 43.6, "completions/max_length": 298.8, "completions/clipped_ratio": 0.0, "kl": 0.13094109743833543, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.051282051282051, "step": 825}, {"loss": 0.005041994154453278, "grad_norm": 0.9417500607932524, "learning_rate": 6.778969234612583e-07, "reward": 2.396875, "reward_std": 0.32111557126045226, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.11468750163912773, "rewards/MazeReward/std": 0.14987295717000962, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 106.153125, "completions/min_length": 44.2, "completions/max_length": 327.4, "completions/clipped_ratio": 0.0, "kl": 0.12604689141735434, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.094017094017094, "step": 830}, {"loss": 0.005015048757195472, "grad_norm": 0.5286700058047759, "learning_rate": 6.740277026221922e-07, "reward": 2.41875, "reward_std": 0.342887257039547, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.11687499880790711, "rewards/MazeReward/std": 0.17734501659870147, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 108.1421875, "completions/min_length": 42.2, "completions/max_length": 307.4, "completions/clipped_ratio": 0.0, "kl": 0.12536690728738903, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.136752136752137, "step": 835}, {"loss": 0.004260452091693878, "grad_norm": 0.7403959888442324, "learning_rate": 6.701465872208216e-07, "reward": 2.509375, "reward_std": 0.5740605995059014, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1259375035762787, "rewards/MazeReward/std": 0.1605320692062378, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.98125, "completions/min_length": 45.8, "completions/max_length": 366.6, "completions/clipped_ratio": 0.0, "kl": 0.10649480815045535, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.17948717948718, "step": 840}, {"loss": 0.004197680950164795, "grad_norm": 0.8837213501426033, "learning_rate": 6.662538425262284e-07, "reward": 2.371875, "reward_std": 0.3282697722315788, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.11218750327825547, "rewards/MazeReward/std": 0.15610671639442444, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.059375, "completions/min_length": 42.0, "completions/max_length": 309.0, "completions/clipped_ratio": 0.0, "kl": 0.10494111906737089, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.222222222222222, "step": 845}, {"loss": 0.004404155537486076, "grad_norm": 0.9404232152801563, "learning_rate": 6.623497346023417e-07, "reward": 2.6326171875, "reward_std": 0.6332006573677063, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.13843750059604645, "rewards/MazeReward/std": 0.18257658779621125, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 122.8390625, "completions/min_length": 46.0, "completions/max_length": 378.2, "completions/clipped_ratio": 0.0, "kl": 0.11009907629340887, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.264957264957265, "step": 850}, {"loss": 0.004830294847488403, "grad_norm": 0.8842868714790371, "learning_rate": 6.584345302897522e-07, "reward": 2.678125, "reward_std": 0.22566969692707062, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.1428125023841858, "rewards/MazeReward/std": 0.15798466503620148, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.6125, "completions/min_length": 46.2, "completions/max_length": 301.6, "completions/clipped_ratio": 0.0, "kl": 0.12074681739322841, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.3076923076923075, "step": 855}, {"loss": 0.0052056387066841125, "grad_norm": 0.12253566912712557, "learning_rate": 6.545084971874736e-07, "reward": 2.134375, "reward_std": 0.07553946599364281, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.0884375050663948, "rewards/MazeReward/std": 0.1338231921195984, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 111.078125, "completions/min_length": 44.8, "completions/max_length": 321.6, "completions/clipped_ratio": 0.0, "kl": 0.13014833349734545, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.35042735042735, "step": 860}, {"loss": 0.004796605557203293, "grad_norm": 0.9828254166110735, "learning_rate": 6.505719036346537e-07, "reward": 2.08125, "reward_std": 0.0818540021777153, "frac_reward_zero_std": 0.95, "rewards/MazeReward/mean": 0.08312500044703483, "rewards/MazeReward/std": 0.1296190157532692, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.615625, "completions/min_length": 43.4, "completions/max_length": 369.6, "completions/clipped_ratio": 0.0, "kl": 0.11990115805529059, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.3931623931623935, "step": 865}, {"loss": 0.0046964243054389955, "grad_norm": 1.5641540692769706, "learning_rate": 6.466250186922324e-07, "reward": 2.278125, "reward_std": 0.37824141681194307, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.10281250178813935, "rewards/MazeReward/std": 0.16968013048171998, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.7828125, "completions/min_length": 45.8, "completions/max_length": 427.8, "completions/clipped_ratio": 0.0, "kl": 0.11739973742514849, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.435897435897436, "step": 870}, {"loss": 0.0041921079158782956, "grad_norm": 0.9879868922490628, "learning_rate": 6.426681121245527e-07, "reward": 2.690625, "reward_std": 0.39118525981903074, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.14406249672174454, "rewards/MazeReward/std": 0.17793795466423035, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.378125, "completions/min_length": 48.2, "completions/max_length": 407.8, "completions/clipped_ratio": 0.0, "kl": 0.10477358950302004, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.478632478632479, "step": 875}, {"loss": 0.0038834869861602782, "grad_norm": 1.2740447986881183, "learning_rate": 6.387014543809223e-07, "reward": 2.121875, "reward_std": 0.42704967558383944, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.0871875025331974, "rewards/MazeReward/std": 0.14741710275411607, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.2828125, "completions/min_length": 48.0, "completions/max_length": 352.6, "completions/clipped_ratio": 0.0, "kl": 0.09708790634758771, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.521367521367521, "step": 880}, {"loss": 0.004379009455442428, "grad_norm": 0.8816744498090584, "learning_rate": 6.347253165771289e-07, "reward": 2.521875, "reward_std": 0.49345279932022096, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1271875023841858, "rewards/MazeReward/std": 0.18064941763877868, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.08125, "completions/min_length": 48.2, "completions/max_length": 338.0, "completions/clipped_ratio": 0.0, "kl": 0.10946230506524443, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.564102564102564, "step": 885}, {"loss": 0.0043511584401130675, "grad_norm": 0.7904246843598328, "learning_rate": 6.307399704769098e-07, "reward": 2.3984375, "reward_std": 0.4378116011619568, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.11500000059604645, "rewards/MazeReward/std": 0.16837832033634187, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.7390625, "completions/min_length": 46.4, "completions/max_length": 357.2, "completions/clipped_ratio": 0.0, "kl": 0.10876897526904941, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.6068376068376065, "step": 890}, {"loss": 0.004724665731191635, "grad_norm": 0.6831600099614877, "learning_rate": 6.26745688473377e-07, "reward": 2.159375, "reward_std": 0.25841794312000277, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.09093750268220901, "rewards/MazeReward/std": 0.15757765173912047, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.16875, "completions/min_length": 42.2, "completions/max_length": 394.4, "completions/clipped_ratio": 0.0, "kl": 0.11811227649450302, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.64957264957265, "step": 895}, {"loss": 0.004878251254558564, "grad_norm": 0.9463456998621641, "learning_rate": 6.227427435703995e-07, "reward": 2.659375, "reward_std": 0.5090285480022431, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.140937502682209, "rewards/MazeReward/std": 0.18577449023723602, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.315625, "completions/min_length": 43.4, "completions/max_length": 392.4, "completions/clipped_ratio": 0.0, "kl": 0.12197576817125082, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.6923076923076925, "step": 900}, {"loss": 0.004768900573253632, "grad_norm": 0.9935293007275701, "learning_rate": 6.187314093639443e-07, "reward": 2.337109375, "reward_std": 0.4040724813938141, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1087500050663948, "rewards/MazeReward/std": 0.171444433927536, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.249609375, "rewards/Format/std": 0.004419417306780815, "completions/mean_length": 124.259375, "completions/min_length": 47.0, "completions/max_length": 310.4, "completions/clipped_ratio": 0.0, "kl": 0.11919824471697212, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.735042735042735, "step": 905}, {"loss": 0.0042796477675437926, "grad_norm": 0.427378607750444, "learning_rate": 6.147119600233758e-07, "reward": 2.640625, "reward_std": 0.5469089686870575, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.13906250447034835, "rewards/MazeReward/std": 0.17976903915405273, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.9703125, "completions/min_length": 48.8, "completions/max_length": 401.8, "completions/clipped_ratio": 0.0, "kl": 0.10699953152798117, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.777777777777778, "step": 910}, {"loss": 0.003946560621261597, "grad_norm": 1.114364765131464, "learning_rate": 6.106846702727172e-07, "reward": 2.365625, "reward_std": 0.5313502073287963, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.11156250238418579, "rewards/MazeReward/std": 0.17100094854831696, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.7734375, "completions/min_length": 47.8, "completions/max_length": 294.8, "completions/clipped_ratio": 0.0, "kl": 0.09865478742867709, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.82051282051282, "step": 915}, {"loss": 0.003832873702049255, "grad_norm": 1.1322236029400599, "learning_rate": 6.066498153718734e-07, "reward": 2.29375, "reward_std": 0.3965612709522247, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.10437500178813934, "rewards/MazeReward/std": 0.159711055457592, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 137.840625, "completions/min_length": 49.6, "completions/max_length": 340.0, "completions/clipped_ratio": 0.0, "kl": 0.0958178190048784, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.863247863247864, "step": 920}, {"loss": 0.0037041474133729935, "grad_norm": 0.9033191349982307, "learning_rate": 6.026076710978171e-07, "reward": 2.5857421875, "reward_std": 0.520411816239357, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.13375000059604644, "rewards/MazeReward/std": 0.17141990661621093, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 137.2859375, "completions/min_length": 51.6, "completions/max_length": 350.4, "completions/clipped_ratio": 0.0, "kl": 0.09261846840381623, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.905982905982906, "step": 925}, {"loss": 0.004257069900631905, "grad_norm": 1.1026841088488806, "learning_rate": 5.985585137257401e-07, "reward": 2.884375, "reward_std": 0.6288280010223388, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.16343750655651093, "rewards/MazeReward/std": 0.1841311573982239, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.3171875, "completions/min_length": 46.4, "completions/max_length": 341.0, "completions/clipped_ratio": 0.0, "kl": 0.10645872093737126, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.948717948717949, "step": 930}, {"loss": 0.004258693009614944, "grad_norm": 1.031998752242879, "learning_rate": 5.945026200101702e-07, "reward": 2.790625, "reward_std": 0.3497179388999939, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.15406250357627868, "rewards/MazeReward/std": 0.18062105774879456, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.8984375, "completions/min_length": 45.8, "completions/max_length": 370.6, "completions/clipped_ratio": 0.0, "kl": 0.10646879714913667, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.9914529914529915, "step": 935}, {"loss": 0.004184092953801155, "grad_norm": 1.037419249263571, "learning_rate": 5.90440267166055e-07, "reward": 2.4390625, "reward_std": 0.5330159664154053, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.1190625011920929, "rewards/MazeReward/std": 0.1763996571302414, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.0609375, "completions/min_length": 44.8, "completions/max_length": 452.6, "completions/clipped_ratio": 0.0, "kl": 0.10459752553142607, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.034188034188034, "step": 940}, {"loss": 0.004381101951003075, "grad_norm": 1.0914915082654844, "learning_rate": 5.863717328498152e-07, "reward": 2.4982421875, "reward_std": 0.3982539355754852, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.12499999850988389, "rewards/MazeReward/std": 0.1781415581703186, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 133.3921875, "completions/min_length": 52.2, "completions/max_length": 391.6, "completions/clipped_ratio": 0.0, "kl": 0.10954389749094844, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.076923076923077, "step": 945}, {"loss": 0.004118229448795319, "grad_norm": 0.9990611755328862, "learning_rate": 5.82297295140367e-07, "reward": 2.390625, "reward_std": 0.492414253950119, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.11406250298023224, "rewards/MazeReward/std": 0.1696094572544098, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.075, "completions/min_length": 50.8, "completions/max_length": 370.0, "completions/clipped_ratio": 0.0, "kl": 0.10294655775651336, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.11965811965812, "step": 950}, {"loss": 0.004560865834355355, "grad_norm": 0.501342223466357, "learning_rate": 5.782172325201155e-07, "reward": 2.165625, "reward_std": 0.3272204905748367, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.09156250134110451, "rewards/MazeReward/std": 0.15920471251010895, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.9890625, "completions/min_length": 45.4, "completions/max_length": 392.2, "completions/clipped_ratio": 0.0, "kl": 0.11401240844279528, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.162393162393162, "step": 955}, {"loss": 0.004170581325888633, "grad_norm": 0.9085688756311281, "learning_rate": 5.741318238559209e-07, "reward": 2.45, "reward_std": 0.3110924273729324, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.12031249850988388, "rewards/MazeReward/std": 0.175260728597641, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.02490137964487076, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 139.0234375, "completions/min_length": 48.4, "completions/max_length": 785.4, "completions/clipped_ratio": 0.003125, "kl": 0.10424431953579187, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.205128205128204, "step": 960}, {"loss": 0.004588994011282921, "grad_norm": 0.4287849382710462, "learning_rate": 5.700413483800389e-07, "reward": 2.875, "reward_std": 0.42599530816078185, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.16250000298023223, "rewards/MazeReward/std": 0.1969437777996063, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.1703125, "completions/min_length": 45.6, "completions/max_length": 381.8, "completions/clipped_ratio": 0.0, "kl": 0.11473485874012113, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.247863247863247, "step": 965}, {"loss": 0.004439573734998703, "grad_norm": 0.4698008767472208, "learning_rate": 5.659460856710345e-07, "reward": 2.678125, "reward_std": 0.35614879578351977, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.14312500208616258, "rewards/MazeReward/std": 0.17075003683567047, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535533845424652, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.865625, "completions/min_length": 44.8, "completions/max_length": 1040.4, "completions/clipped_ratio": 0.003125, "kl": 0.11099314470775426, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.290598290598291, "step": 970}, {"loss": 0.004387399554252625, "grad_norm": 1.1246814355740533, "learning_rate": 5.618463156346739e-07, "reward": 2.5203125, "reward_std": 0.4758760154247284, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.12718750089406966, "rewards/MazeReward/std": 0.17132539451122283, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.553125, "completions/min_length": 44.4, "completions/max_length": 679.6, "completions/clipped_ratio": 0.0015625, "kl": 0.10966736217960715, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.333333333333334, "step": 975}, {"loss": 0.004825248569250107, "grad_norm": 0.9308115350285304, "learning_rate": 5.577423184847931e-07, "reward": 2.5078125, "reward_std": 0.4877462863922119, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1259375035762787, "rewards/MazeReward/std": 0.17260952889919282, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.0546875, "completions/min_length": 45.0, "completions/max_length": 643.6, "completions/clipped_ratio": 0.0015625, "kl": 0.12060628677718341, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.376068376068377, "step": 980}, {"loss": 0.005096112936735153, "grad_norm": 0.45122353857825026, "learning_rate": 5.536343747241459e-07, "reward": 2.7076171875, "reward_std": 0.2750555261969566, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.1459375023841858, "rewards/MazeReward/std": 0.1707939773797989, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 103.9421875, "completions/min_length": 40.8, "completions/max_length": 306.0, "completions/clipped_ratio": 0.0, "kl": 0.12739773923531175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.418803418803419, "step": 985}, {"loss": 0.0053529292345047, "grad_norm": 0.5176552447530166, "learning_rate": 5.495227651252315e-07, "reward": 2.628125, "reward_std": 0.2354552686214447, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.13781250268220901, "rewards/MazeReward/std": 0.1868099868297577, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 108.8203125, "completions/min_length": 44.6, "completions/max_length": 326.4, "completions/clipped_ratio": 0.0, "kl": 0.13383744256570934, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.461538461538462, "step": 990}, {"loss": 0.005100805312395096, "grad_norm": 0.7040039230728856, "learning_rate": 5.454077707111041e-07, "reward": 2.84375, "reward_std": 0.34772194027900694, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.15937500447034836, "rewards/MazeReward/std": 0.2054050385951996, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.4109375, "completions/min_length": 46.4, "completions/max_length": 337.4, "completions/clipped_ratio": 0.0, "kl": 0.12751290397718548, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.504273504273504, "step": 995}, {"loss": 0.004598812013864517, "grad_norm": 0.4855991548830803, "learning_rate": 5.412896727361662e-07, "reward": 2.396875, "reward_std": 0.29418938159942626, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.11468749940395355, "rewards/MazeReward/std": 0.15751168578863145, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.3171875, "completions/min_length": 44.6, "completions/max_length": 402.8, "completions/clipped_ratio": 0.0, "kl": 0.11497611114755273, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.547008547008547, "step": 1000}, {"loss": 0.004567617923021317, "grad_norm": 1.1949849654241222, "learning_rate": 5.371687526669439e-07, "reward": 2.446875, "reward_std": 0.5055377662181855, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.11968750357627869, "rewards/MazeReward/std": 0.17761976420879363, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.1109375, "completions/min_length": 48.8, "completions/max_length": 381.6, "completions/clipped_ratio": 0.0, "kl": 0.11416345727629959, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.58974358974359, "step": 1005}, {"loss": 0.0042507462203502655, "grad_norm": 0.8230226369558373, "learning_rate": 5.330452921628497e-07, "reward": 2.765625, "reward_std": 0.5198424816131592, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1515625, "rewards/MazeReward/std": 0.19108597040176392, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 135.553125, "completions/min_length": 49.6, "completions/max_length": 384.0, "completions/clipped_ratio": 0.0, "kl": 0.10625858008861541, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.632478632478632, "step": 1010}, {"loss": 0.00396508052945137, "grad_norm": 0.683456389574033, "learning_rate": 5.28919573056932e-07, "reward": 2.821875, "reward_std": 0.4673504412174225, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.15718750655651093, "rewards/MazeReward/std": 0.19530395567417144, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.9265625, "completions/min_length": 48.0, "completions/max_length": 403.0, "completions/clipped_ratio": 0.0, "kl": 0.09912157701328397, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.675213675213675, "step": 1015}, {"loss": 0.004162249714136123, "grad_norm": 0.7555265167378185, "learning_rate": 5.247918773366111e-07, "reward": 2.9375, "reward_std": 0.2453582763671875, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.16875, "rewards/MazeReward/std": 0.1709858000278473, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.9421875, "completions/min_length": 47.4, "completions/max_length": 369.2, "completions/clipped_ratio": 0.0, "kl": 0.10403793673031032, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.717948717948717, "step": 1020}, {"loss": 0.004086394608020782, "grad_norm": 0.5237248226742194, "learning_rate": 5.206624871244065e-07, "reward": 2.653125, "reward_std": 0.39622444808483126, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.1403125062584877, "rewards/MazeReward/std": 0.18336015641689302, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 135.3453125, "completions/min_length": 48.2, "completions/max_length": 445.0, "completions/clipped_ratio": 0.0, "kl": 0.10214884807355702, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.760683760683762, "step": 1025}, {"loss": 0.003640550747513771, "grad_norm": 0.7706414470502965, "learning_rate": 5.165316846586541e-07, "reward": 2.846875, "reward_std": 0.5451147437095643, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1596875011920929, "rewards/MazeReward/std": 0.1916624754667282, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 144.084375, "completions/min_length": 46.2, "completions/max_length": 376.0, "completions/clipped_ratio": 0.0, "kl": 0.09099415931850671, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.803418803418804, "step": 1030}, {"loss": 0.0036819610744714738, "grad_norm": 0.7261257688683561, "learning_rate": 5.123997522742151e-07, "reward": 2.78125, "reward_std": 0.2184166505932808, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.15312500596046447, "rewards/MazeReward/std": 0.19261254966259003, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 151.68125, "completions/min_length": 53.8, "completions/max_length": 493.6, "completions/clipped_ratio": 0.0, "kl": 0.09205408911220729, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.846153846153847, "step": 1035}, {"loss": 0.0038291953504085543, "grad_norm": 1.2739755973652953, "learning_rate": 5.082669723831793e-07, "reward": 2.6623046875, "reward_std": 0.6662474989891052, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.14125000238418578, "rewards/MazeReward/std": 0.21111677289009095, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 142.8265625, "completions/min_length": 50.0, "completions/max_length": 362.6, "completions/clipped_ratio": 0.0, "kl": 0.09573275893926621, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.88888888888889, "step": 1040}, {"loss": 0.003523694723844528, "grad_norm": 1.0205023124388044, "learning_rate": 5.041336274555625e-07, "reward": 2.6294921875, "reward_std": 0.8672631502151489, "frac_reward_zero_std": 0.7, "rewards/MazeReward/mean": 0.13812500089406968, "rewards/MazeReward/std": 0.21768845319747926, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 154.9078125, "completions/min_length": 54.6, "completions/max_length": 406.6, "completions/clipped_ratio": 0.0, "kl": 0.0880844673141837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.931623931623932, "step": 1045}, {"loss": 0.003172917664051056, "grad_norm": 0.8092165727521461, "learning_rate": 5e-07, "reward": 2.503125, "reward_std": 0.5358554720878601, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.12531249970197678, "rewards/MazeReward/std": 0.17834349870681762, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 153.0296875, "completions/min_length": 52.8, "completions/max_length": 379.2, "completions/clipped_ratio": 0.0, "kl": 0.07932715229690075, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 8.974358974358974, "step": 1050}, {"loss": 0.0033974848687648774, "grad_norm": 1.3586878566240224, "learning_rate": 4.958663725444375e-07, "reward": 2.8109375, "reward_std": 0.677154815196991, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.1562500014901161, "rewards/MazeReward/std": 0.19654485285282136, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 152.5109375, "completions/min_length": 55.8, "completions/max_length": 421.0, "completions/clipped_ratio": 0.0, "kl": 0.08491902407258749, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.017094017094017, "step": 1055}, {"loss": 0.003643970936536789, "grad_norm": 0.34243040085731946, "learning_rate": 4.917330276168208e-07, "reward": 2.803125, "reward_std": 0.6713367283344269, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.1553125038743019, "rewards/MazeReward/std": 0.20383458137512206, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 146.475, "completions/min_length": 52.0, "completions/max_length": 389.2, "completions/clipped_ratio": 0.0, "kl": 0.0910864389501512, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.05982905982906, "step": 1060}, {"loss": 0.004185883700847626, "grad_norm": 0.875129358025704, "learning_rate": 4.87600247725785e-07, "reward": 3.0625, "reward_std": 0.48923705220222474, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.18125000596046448, "rewards/MazeReward/std": 0.21844838559627533, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 139.878125, "completions/min_length": 49.8, "completions/max_length": 390.2, "completions/clipped_ratio": 0.0, "kl": 0.10464337235316634, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.102564102564102, "step": 1065}, {"loss": 0.004287224635481835, "grad_norm": 0.42757762474877375, "learning_rate": 4.834683153413459e-07, "reward": 2.64375, "reward_std": 0.37823321521282194, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.13937500417232512, "rewards/MazeReward/std": 0.1645449861884117, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.3484375, "completions/min_length": 46.2, "completions/max_length": 419.4, "completions/clipped_ratio": 0.0, "kl": 0.10715236896649002, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.145299145299145, "step": 1070}, {"loss": 0.004490474238991737, "grad_norm": 1.2347467695458751, "learning_rate": 4.793375128755933e-07, "reward": 2.990625, "reward_std": 0.6784021079540252, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.17406250536441803, "rewards/MazeReward/std": 0.21663503348827362, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.9015625, "completions/min_length": 48.8, "completions/max_length": 370.8, "completions/clipped_ratio": 0.0, "kl": 0.11225857324898243, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.188034188034187, "step": 1075}, {"loss": 0.004342434555292129, "grad_norm": 0.8469089772634869, "learning_rate": 4.752081226633888e-07, "reward": 3.11875, "reward_std": 0.5571016371250153, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.18687500953674316, "rewards/MazeReward/std": 0.22037321627140044, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.5484375, "completions/min_length": 53.2, "completions/max_length": 291.4, "completions/clipped_ratio": 0.0, "kl": 0.10855783149600029, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.23076923076923, "step": 1080}, {"loss": 0.0037718590348958967, "grad_norm": 0.7604487155971665, "learning_rate": 4.71080426943068e-07, "reward": 2.740625, "reward_std": 0.49681556224823, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.14906250387430192, "rewards/MazeReward/std": 0.19011184573173523, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.4609375, "completions/min_length": 47.6, "completions/max_length": 329.8, "completions/clipped_ratio": 0.0, "kl": 0.09429952083155513, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.273504273504274, "step": 1085}, {"loss": 0.004375176876783371, "grad_norm": 0.6965466774930933, "learning_rate": 4.669547078371503e-07, "reward": 2.921875, "reward_std": 0.46210750937461853, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.16718750447034836, "rewards/MazeReward/std": 0.19805112481117249, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.4609375, "completions/min_length": 53.2, "completions/max_length": 318.6, "completions/clipped_ratio": 0.0, "kl": 0.10939432140439749, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.316239316239317, "step": 1090}, {"loss": 0.003982530534267425, "grad_norm": 0.7159299198254374, "learning_rate": 4.628312473330562e-07, "reward": 2.7263671875, "reward_std": 0.5215662240982055, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.14781250059604645, "rewards/MazeReward/std": 0.19598830342292786, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 129.3796875, "completions/min_length": 47.2, "completions/max_length": 672.8, "completions/clipped_ratio": 0.0, "kl": 0.09956523487344385, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.35897435897436, "step": 1095}, {"loss": 0.004355636984109878, "grad_norm": 0.8152789797212142, "learning_rate": 4.5871032726383385e-07, "reward": 2.590625, "reward_std": 0.312482450902462, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1340624988079071, "rewards/MazeReward/std": 0.1762324720621109, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.275, "completions/min_length": 51.4, "completions/max_length": 349.4, "completions/clipped_ratio": 0.0, "kl": 0.10890261642634869, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.401709401709402, "step": 1100}, {"loss": 0.004354207217693329, "grad_norm": 1.0145942446437635, "learning_rate": 4.5459222928889587e-07, "reward": 2.721875, "reward_std": 0.2637569785118103, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.14718750417232512, "rewards/MazeReward/std": 0.19373294115066528, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.2421875, "completions/min_length": 47.4, "completions/max_length": 332.2, "completions/clipped_ratio": 0.0, "kl": 0.108840207522735, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.444444444444445, "step": 1105}, {"loss": 0.004366296529769898, "grad_norm": 1.1849331018616072, "learning_rate": 4.5047723487476864e-07, "reward": 2.540625, "reward_std": 0.5292814433574676, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.1290624976158142, "rewards/MazeReward/std": 0.2007750853896141, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.5453125, "completions/min_length": 50.0, "completions/max_length": 339.6, "completions/clipped_ratio": 0.0, "kl": 0.10914051588624715, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.487179487179487, "step": 1110}, {"loss": 0.0040661245584487915, "grad_norm": 1.286125065843318, "learning_rate": 4.463656252758542e-07, "reward": 2.771875, "reward_std": 0.4567874908447266, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1521875038743019, "rewards/MazeReward/std": 0.18613163828849794, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 139.1921875, "completions/min_length": 45.0, "completions/max_length": 391.0, "completions/clipped_ratio": 0.0, "kl": 0.10163588528521358, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.52991452991453, "step": 1115}, {"loss": 0.004218711704015732, "grad_norm": 1.5016589865441343, "learning_rate": 4.4225768151520694e-07, "reward": 2.7125, "reward_std": 0.5366943567991257, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.14625000357627868, "rewards/MazeReward/std": 0.21170917451381682, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.6546875, "completions/min_length": 50.8, "completions/max_length": 369.6, "completions/clipped_ratio": 0.0, "kl": 0.10546655040234328, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.572649572649572, "step": 1120}, {"loss": 0.0038293473422527312, "grad_norm": 1.1798897236609454, "learning_rate": 4.381536843653261e-07, "reward": 2.8375, "reward_std": 0.7899509906768799, "frac_reward_zero_std": 0.6875, "rewards/MazeReward/mean": 0.15875000059604644, "rewards/MazeReward/std": 0.20503087043762208, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 138.1640625, "completions/min_length": 50.4, "completions/max_length": 336.8, "completions/clipped_ratio": 0.0, "kl": 0.09572657975368201, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.615384615384615, "step": 1125}, {"loss": 0.003905288875102997, "grad_norm": 0.6998974993851493, "learning_rate": 4.340539143289655e-07, "reward": 2.571875, "reward_std": 0.45505390167236326, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.13218750134110452, "rewards/MazeReward/std": 0.17721938192844391, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.64375, "completions/min_length": 48.4, "completions/max_length": 363.8, "completions/clipped_ratio": 0.0, "kl": 0.09763574441894889, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.658119658119658, "step": 1130}, {"loss": 0.003902355581521988, "grad_norm": 0.5155982985374374, "learning_rate": 4.2995865161996104e-07, "reward": 2.928125, "reward_std": 0.5552009463310241, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.1678125038743019, "rewards/MazeReward/std": 0.20430524051189422, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 135.034375, "completions/min_length": 51.2, "completions/max_length": 402.2, "completions/clipped_ratio": 0.0, "kl": 0.09754181425087154, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.7008547008547, "step": 1135}, {"loss": 0.004526397585868836, "grad_norm": 0.7226746944064936, "learning_rate": 4.258681761440789e-07, "reward": 2.86875, "reward_std": 0.5527862429618835, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.16187500059604645, "rewards/MazeReward/std": 0.2129779577255249, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.05625, "completions/min_length": 52.4, "completions/max_length": 314.2, "completions/clipped_ratio": 0.0, "kl": 0.11315394174307584, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.743589743589745, "step": 1140}, {"loss": 0.0042602140456438065, "grad_norm": 1.3399570894615993, "learning_rate": 4.2178276747988444e-07, "reward": 3.1888671875, "reward_std": 0.47525117695331576, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.19406250417232512, "rewards/MazeReward/std": 0.21473246216773986, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 128.296875, "completions/min_length": 49.4, "completions/max_length": 388.6, "completions/clipped_ratio": 0.0, "kl": 0.106480473279953, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.786324786324787, "step": 1145}, {"loss": 0.004187341779470444, "grad_norm": 0.09503596331402567, "learning_rate": 4.1770270485963294e-07, "reward": 2.778125, "reward_std": 0.43286781907081606, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.15281250178813935, "rewards/MazeReward/std": 0.19566009640693666, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.440625, "completions/min_length": 49.0, "completions/max_length": 400.6, "completions/clipped_ratio": 0.0, "kl": 0.10468061766587197, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.82905982905983, "step": 1150}, {"loss": 0.004206154868006707, "grad_norm": 0.7235398012781328, "learning_rate": 4.1362826715018497e-07, "reward": 2.775, "reward_std": 0.40023252964019773, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.15250000655651091, "rewards/MazeReward/std": 0.20179781019687654, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.01875, "completions/min_length": 48.6, "completions/max_length": 412.2, "completions/clipped_ratio": 0.0, "kl": 0.10513886674307286, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.871794871794872, "step": 1155}, {"loss": 0.0038186319172382353, "grad_norm": 0.3737954120293668, "learning_rate": 4.095597328339452e-07, "reward": 2.584375, "reward_std": 0.29082661867141724, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.13343750238418578, "rewards/MazeReward/std": 0.19001711010932923, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.5703125, "completions/min_length": 50.4, "completions/max_length": 358.4, "completions/clipped_ratio": 0.0, "kl": 0.09545613052323461, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.914529914529915, "step": 1160}, {"loss": 0.003987422212958336, "grad_norm": 0.06752438787043205, "learning_rate": 4.0549737998982994e-07, "reward": 2.4984375, "reward_std": 0.26228815913200376, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.12499999850988389, "rewards/MazeReward/std": 0.16459157764911653, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.8875, "completions/min_length": 49.8, "completions/max_length": 522.4, "completions/clipped_ratio": 0.0, "kl": 0.09968637404963374, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.957264957264957, "step": 1165}, {"loss": 0.003986567258834839, "grad_norm": 0.875921562762164, "learning_rate": 4.0144148627425986e-07, "reward": 2.490625, "reward_std": 0.464725586771965, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.12406250089406967, "rewards/MazeReward/std": 0.18573529720306398, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.1, "completions/min_length": 51.6, "completions/max_length": 415.6, "completions/clipped_ratio": 0.0, "kl": 0.09964474057778716, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.0, "step": 1170}, {"loss": 0.004234510287642479, "grad_norm": 0.9439809021490256, "learning_rate": 3.973923289021829e-07, "reward": 3.0, "reward_std": 0.6187429428100586, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.17500000596046447, "rewards/MazeReward/std": 0.22677642703056336, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.7765625, "completions/min_length": 49.4, "completions/max_length": 374.0, "completions/clipped_ratio": 0.0, "kl": 0.10586870852857828, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.042735042735043, "step": 1175}, {"loss": 0.00430932566523552, "grad_norm": 0.524992016977372, "learning_rate": 3.9335018462812664e-07, "reward": 3.1625, "reward_std": 0.5555283963680268, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.19125000089406968, "rewards/MazeReward/std": 0.20346350967884064, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.2671875, "completions/min_length": 49.0, "completions/max_length": 312.0, "completions/clipped_ratio": 0.0, "kl": 0.1077304735314101, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.085470085470085, "step": 1180}, {"loss": 0.004483478516340256, "grad_norm": 0.8855337385204372, "learning_rate": 3.893153297272828e-07, "reward": 2.740625, "reward_std": 0.713850450515747, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.14906250238418578, "rewards/MazeReward/std": 0.22811269164085388, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.9328125, "completions/min_length": 48.8, "completions/max_length": 327.0, "completions/clipped_ratio": 0.0, "kl": 0.11209777896292508, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.128205128205128, "step": 1185}, {"loss": 0.004512125626206398, "grad_norm": 0.9321602133252843, "learning_rate": 3.8528803997662423e-07, "reward": 2.646875, "reward_std": 0.42052239179611206, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1396875038743019, "rewards/MazeReward/std": 0.19612097144126892, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.625, "completions/min_length": 45.4, "completions/max_length": 319.2, "completions/clipped_ratio": 0.0, "kl": 0.11280471049249172, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.17094017094017, "step": 1190}, {"loss": 0.004368701577186584, "grad_norm": 0.5102698579518901, "learning_rate": 3.812685906360557e-07, "reward": 3.01875, "reward_std": 0.43965511322021483, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.17687500417232513, "rewards/MazeReward/std": 0.1892246425151825, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.7015625, "completions/min_length": 46.6, "completions/max_length": 342.0, "completions/clipped_ratio": 0.0, "kl": 0.10920268264599145, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.213675213675213, "step": 1195}, {"loss": 0.00459345206618309, "grad_norm": 1.1495768744719015, "learning_rate": 3.772572564296004e-07, "reward": 2.9375, "reward_std": 0.5186841666698456, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.16875000298023224, "rewards/MazeReward/std": 0.21820463240146637, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.0359375, "completions/min_length": 46.8, "completions/max_length": 362.8, "completions/clipped_ratio": 0.0, "kl": 0.11481720227748156, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.256410256410255, "step": 1200}, {"loss": 0.004383664578199387, "grad_norm": 0.6393428851714967, "learning_rate": 3.7325431152662294e-07, "reward": 2.653125, "reward_std": 0.3888654768466949, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.14031250178813934, "rewards/MazeReward/std": 0.18066073656082154, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.940625, "completions/min_length": 48.4, "completions/max_length": 345.6, "completions/clipped_ratio": 0.0, "kl": 0.10959113240242005, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.2991452991453, "step": 1205}, {"loss": 0.004426059126853943, "grad_norm": 0.656354448868076, "learning_rate": 3.692600295230901e-07, "reward": 2.778125, "reward_std": 0.4997374087572098, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.15281250029802323, "rewards/MazeReward/std": 0.18737704753875734, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.88125, "completions/min_length": 48.2, "completions/max_length": 313.4, "completions/clipped_ratio": 0.0, "kl": 0.1106418407522142, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.341880341880342, "step": 1210}, {"loss": 0.004874389618635178, "grad_norm": 0.6022781128494344, "learning_rate": 3.6527468342287096e-07, "reward": 2.503125, "reward_std": 0.5901626765727996, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1253125011920929, "rewards/MazeReward/std": 0.19279995262622834, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.975, "completions/min_length": 47.6, "completions/max_length": 364.8, "completions/clipped_ratio": 0.0, "kl": 0.1218629932962358, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.384615384615385, "step": 1215}, {"loss": 0.004882078245282173, "grad_norm": 1.0773650446880119, "learning_rate": 3.612985456190778e-07, "reward": 2.84375, "reward_std": 0.3281531363725662, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.15937500149011613, "rewards/MazeReward/std": 0.19335278272628784, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.75, "completions/min_length": 49.0, "completions/max_length": 355.2, "completions/clipped_ratio": 0.0, "kl": 0.12203081138432026, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.427350427350428, "step": 1220}, {"loss": 0.004895142465829849, "grad_norm": 0.45989035827119584, "learning_rate": 3.5733188787544746e-07, "reward": 2.425, "reward_std": 0.30953381955623627, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.11750000417232513, "rewards/MazeReward/std": 0.17827851474285125, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.4015625, "completions/min_length": 49.4, "completions/max_length": 360.4, "completions/clipped_ratio": 0.0, "kl": 0.12238692920655012, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.47008547008547, "step": 1225}, {"loss": 0.004211057722568512, "grad_norm": 0.4765218732894405, "learning_rate": 3.533749813077677e-07, "reward": 2.9625, "reward_std": 0.44157418608665466, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.17124999761581422, "rewards/MazeReward/std": 0.20729261934757232, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.1125, "completions/min_length": 47.8, "completions/max_length": 329.0, "completions/clipped_ratio": 0.0, "kl": 0.10526276314631104, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.512820512820513, "step": 1230}, {"loss": 0.004125615209341049, "grad_norm": 0.06937887152965579, "learning_rate": 3.4942809636534633e-07, "reward": 2.4125, "reward_std": 0.35109097361564634, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.11625000089406967, "rewards/MazeReward/std": 0.17364375293254852, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.059375, "completions/min_length": 46.0, "completions/max_length": 376.4, "completions/clipped_ratio": 0.0, "kl": 0.10313204862177372, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.555555555555555, "step": 1235}, {"loss": 0.004409436881542206, "grad_norm": 1.150941241555127, "learning_rate": 3.454915028125263e-07, "reward": 3.0859375, "reward_std": 0.5376386404037475, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.1837500035762787, "rewards/MazeReward/std": 0.20490942895412445, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.6140625, "completions/min_length": 48.6, "completions/max_length": 664.0, "completions/clipped_ratio": 0.0, "kl": 0.110233462927863, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.598290598290598, "step": 1240}, {"loss": 0.004092198982834816, "grad_norm": 0.5906612685205014, "learning_rate": 3.415654697102478e-07, "reward": 2.975, "reward_std": 0.41274300813674925, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.17250000387430192, "rewards/MazeReward/std": 0.20381629168987275, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.146875, "completions/min_length": 44.8, "completions/max_length": 379.6, "completions/clipped_ratio": 0.0, "kl": 0.10231563309207559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.64102564102564, "step": 1245}, {"loss": 0.004274631291627884, "grad_norm": 1.092725237784792, "learning_rate": 3.3765026539765827e-07, "reward": 3.0890625, "reward_std": 0.4289448082447052, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.18406250476837158, "rewards/MazeReward/std": 0.2021147519350052, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.378125, "completions/min_length": 46.0, "completions/max_length": 712.8, "completions/clipped_ratio": 0.0015625, "kl": 0.10686524836346507, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.683760683760683, "step": 1250}, {"loss": 0.004324822500348091, "grad_norm": 1.315849668651416, "learning_rate": 3.337461574737716e-07, "reward": 2.809375, "reward_std": 0.5549110531806946, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1559375047683716, "rewards/MazeReward/std": 0.2119748830795288, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.9953125, "completions/min_length": 45.8, "completions/max_length": 354.8, "completions/clipped_ratio": 0.0, "kl": 0.10811927812173963, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.726495726495726, "step": 1255}, {"loss": 0.004287507385015488, "grad_norm": 1.0598547332375918, "learning_rate": 3.2985341277917846e-07, "reward": 2.8419921875, "reward_std": 0.6813661992549896, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.159375, "rewards/MazeReward/std": 0.2184309720993042, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 119.8640625, "completions/min_length": 45.4, "completions/max_length": 376.6, "completions/clipped_ratio": 0.0, "kl": 0.10716007966548205, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.76923076923077, "step": 1260}, {"loss": 0.004745027422904969, "grad_norm": 1.0596037288036213, "learning_rate": 3.2597229737780774e-07, "reward": 2.846875, "reward_std": 0.6191704094409942, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1596875011920929, "rewards/MazeReward/std": 0.21988584697246552, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.6578125, "completions/min_length": 46.2, "completions/max_length": 343.2, "completions/clipped_ratio": 0.0, "kl": 0.11861755037680269, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.811965811965813, "step": 1265}, {"loss": 0.004420583695173263, "grad_norm": 0.8639156915181337, "learning_rate": 3.221030765387417e-07, "reward": 2.965625, "reward_std": 0.526892964541912, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.17156250178813934, "rewards/MazeReward/std": 0.19438618421554565, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.2828125, "completions/min_length": 47.4, "completions/max_length": 338.4, "completions/clipped_ratio": 0.0, "kl": 0.11051362464204431, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.854700854700855, "step": 1270}, {"loss": 0.00436118096113205, "grad_norm": 1.015274606144164, "learning_rate": 3.1824601471808497e-07, "reward": 2.9625, "reward_std": 0.2762803971767426, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.17125000357627868, "rewards/MazeReward/std": 0.20243596732616426, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.58125, "completions/min_length": 47.4, "completions/max_length": 323.8, "completions/clipped_ratio": 0.0, "kl": 0.10902460129000247, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.897435897435898, "step": 1275}, {"loss": 0.0040006622672081, "grad_norm": 0.8250612326370127, "learning_rate": 3.1440137554088953e-07, "reward": 2.6875, "reward_std": 0.5732998341321945, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.14375000596046447, "rewards/MazeReward/std": 0.18184928297996522, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.1890625, "completions/min_length": 46.0, "completions/max_length": 334.2, "completions/clipped_ratio": 0.0, "kl": 0.10001396774314344, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.94017094017094, "step": 1280}, {"loss": 0.004220445826649666, "grad_norm": 0.8773745189185551, "learning_rate": 3.1056942178313604e-07, "reward": 2.759375, "reward_std": 0.6357157766819, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.1509375035762787, "rewards/MazeReward/std": 0.1942424565553665, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.153125, "completions/min_length": 47.0, "completions/max_length": 376.2, "completions/clipped_ratio": 0.0, "kl": 0.10550105930306017, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 10.982905982905983, "step": 1285}, {"loss": 0.004067954793572426, "grad_norm": 0.6622838321365806, "learning_rate": 3.06750415353774e-07, "reward": 3.0125, "reward_std": 0.44894702434539796, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1762499988079071, "rewards/MazeReward/std": 0.20109855830669404, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.2421875, "completions/min_length": 46.2, "completions/max_length": 312.8, "completions/clipped_ratio": 0.0, "kl": 0.1016877539921552, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.025641025641026, "step": 1290}, {"loss": 0.0042134784162044525, "grad_norm": 0.9540298385632097, "learning_rate": 3.029446172768193e-07, "reward": 3.209375, "reward_std": 0.7137496113777161, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.19593750238418578, "rewards/MazeReward/std": 0.2500693678855896, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.7203125, "completions/min_length": 47.8, "completions/max_length": 337.4, "completions/clipped_ratio": 0.0, "kl": 0.10531652322970331, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.068376068376068, "step": 1295}, {"loss": 0.003992287442088127, "grad_norm": 0.8697472365022582, "learning_rate": 2.9915228767351535e-07, "reward": 3.35625, "reward_std": 0.40894377380609515, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.2106250047683716, "rewards/MazeReward/std": 0.2234483391046524, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.8359375, "completions/min_length": 46.2, "completions/max_length": 343.6, "completions/clipped_ratio": 0.0, "kl": 0.09979213373735547, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.11111111111111, "step": 1300}, {"loss": 0.004148208349943161, "grad_norm": 0.8038699956286568, "learning_rate": 2.9537368574455303e-07, "reward": 2.81875, "reward_std": 0.17559237778186798, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.15687500238418578, "rewards/MazeReward/std": 0.18436635434627532, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.2, "completions/min_length": 46.8, "completions/max_length": 321.2, "completions/clipped_ratio": 0.0, "kl": 0.10371305770240724, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.153846153846153, "step": 1305}, {"loss": 0.004228459671139717, "grad_norm": 1.1685515632535388, "learning_rate": 2.916090697523549e-07, "reward": 2.9125, "reward_std": 0.6130844354629517, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.16625000387430192, "rewards/MazeReward/std": 0.2074292153120041, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.8859375, "completions/min_length": 45.0, "completions/max_length": 389.4, "completions/clipped_ratio": 0.0, "kl": 0.1057190123014152, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.196581196581196, "step": 1310}, {"loss": 0.0039792083203792575, "grad_norm": 0.9525409732104532, "learning_rate": 2.878586970034232e-07, "reward": 2.715625, "reward_std": 0.6362109780311584, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.14656249880790712, "rewards/MazeReward/std": 0.20846615433692933, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.5921875, "completions/min_length": 44.8, "completions/max_length": 346.8, "completions/clipped_ratio": 0.0, "kl": 0.09947788114659488, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.239316239316238, "step": 1315}, {"loss": 0.003972093015909195, "grad_norm": 1.012743064388902, "learning_rate": 2.841228238307536e-07, "reward": 2.8421875, "reward_std": 0.5440541952848434, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.159375, "rewards/MazeReward/std": 0.20577182173728942, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.5765625, "completions/min_length": 48.0, "completions/max_length": 613.8, "completions/clipped_ratio": 0.0, "kl": 0.09928330578841268, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.282051282051283, "step": 1320}, {"loss": 0.0042315319180488585, "grad_norm": 0.5037648450165342, "learning_rate": 2.8040170557631485e-07, "reward": 2.6625, "reward_std": 0.5498288422822952, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.14124999940395355, "rewards/MazeReward/std": 0.2035118579864502, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.5203125, "completions/min_length": 49.8, "completions/max_length": 345.0, "completions/clipped_ratio": 0.0, "kl": 0.10580486245453358, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.324786324786325, "step": 1325}, {"loss": 0.004185299202799797, "grad_norm": 1.0345783179989916, "learning_rate": 2.7669559657359673e-07, "reward": 2.89375, "reward_std": 0.33696596026420594, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.1643750011920929, "rewards/MazeReward/std": 0.1924690842628479, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.934375, "completions/min_length": 45.0, "completions/max_length": 364.4, "completions/clipped_ratio": 0.0, "kl": 0.10461925230920315, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.367521367521368, "step": 1330}, {"loss": 0.004698439687490463, "grad_norm": 0.6335818415855536, "learning_rate": 2.730047501302266e-07, "reward": 2.909375, "reward_std": 0.5527952641248703, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.16593750119209288, "rewards/MazeReward/std": 0.22284969091415405, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.09375, "completions/min_length": 44.4, "completions/max_length": 341.0, "completions/clipped_ratio": 0.0, "kl": 0.11745278518646955, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.41025641025641, "step": 1335}, {"loss": 0.005062433332204819, "grad_norm": 0.13903078959487353, "learning_rate": 2.6932941851065615e-07, "reward": 3.159375, "reward_std": 0.3380592703819275, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.19093750417232513, "rewards/MazeReward/std": 0.22119204699993134, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 115.3859375, "completions/min_length": 47.0, "completions/max_length": 313.4, "completions/clipped_ratio": 0.0, "kl": 0.12655633548274636, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.452991452991453, "step": 1340}, {"loss": 0.004987946525216102, "grad_norm": 0.972347935406839, "learning_rate": 2.656698529189193e-07, "reward": 2.85, "reward_std": 0.7588642656803131, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.15999999791383743, "rewards/MazeReward/std": 0.20993177592754364, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.978125, "completions/min_length": 44.0, "completions/max_length": 340.4, "completions/clipped_ratio": 0.0, "kl": 0.12469737268984318, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.495726495726496, "step": 1345}, {"loss": 0.005150691419839859, "grad_norm": 0.5534386973615292, "learning_rate": 2.620263034814632e-07, "reward": 2.846875, "reward_std": 0.3248968005180359, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.15968750715255736, "rewards/MazeReward/std": 0.20917012691497802, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 109.29375, "completions/min_length": 45.8, "completions/max_length": 285.8, "completions/clipped_ratio": 0.0, "kl": 0.12874501328915358, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.538461538461538, "step": 1350}, {"loss": 0.004754256457090378, "grad_norm": 0.9414528186973988, "learning_rate": 2.58399019230052e-07, "reward": 3.0359375, "reward_std": 0.4852349281311035, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.17875000238418579, "rewards/MazeReward/std": 0.211050084233284, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 108.83125, "completions/min_length": 45.6, "completions/max_length": 634.2, "completions/clipped_ratio": 0.0015625, "kl": 0.11885394980199634, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.581196581196581, "step": 1355}, {"loss": 0.005104497820138931, "grad_norm": 1.1805673739035394, "learning_rate": 2.547882480847461e-07, "reward": 2.9171875, "reward_std": 0.5961668491363525, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.16687500178813935, "rewards/MazeReward/std": 0.21798062324523926, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.1015625, "completions/min_length": 47.6, "completions/max_length": 666.6, "completions/clipped_ratio": 0.0015625, "kl": 0.12760760858654976, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.623931623931623, "step": 1360}, {"loss": 0.004765240848064423, "grad_norm": 1.1715898226877905, "learning_rate": 2.5119423683695657e-07, "reward": 3.478125, "reward_std": 0.7005012273788452, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.2228125035762787, "rewards/MazeReward/std": 0.25120378732681276, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.7140625, "completions/min_length": 46.8, "completions/max_length": 361.0, "completions/clipped_ratio": 0.0, "kl": 0.1191184351220727, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.666666666666666, "step": 1365}, {"loss": 0.004644834622740746, "grad_norm": 0.8756316340865241, "learning_rate": 2.476172311325783e-07, "reward": 3.053125, "reward_std": 0.5067808628082275, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.18031250536441804, "rewards/MazeReward/std": 0.23398211300373079, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.2796875, "completions/min_length": 47.2, "completions/max_length": 329.0, "completions/clipped_ratio": 0.0, "kl": 0.11611179038882255, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.709401709401709, "step": 1370}, {"loss": 0.0044922705739736555, "grad_norm": 1.2590183342356303, "learning_rate": 2.440574754551996e-07, "reward": 3.503125, "reward_std": 0.6268679082393647, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.2253125101327896, "rewards/MazeReward/std": 0.2107793927192688, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.0765625, "completions/min_length": 46.2, "completions/max_length": 336.0, "completions/clipped_ratio": 0.0, "kl": 0.11228699986822903, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.752136752136753, "step": 1375}, {"loss": 0.004810808598995209, "grad_norm": 1.1992929533345873, "learning_rate": 2.4051521310939254e-07, "reward": 2.684375, "reward_std": 0.46283130049705506, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.14343750327825547, "rewards/MazeReward/std": 0.20171170830726623, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.615625, "completions/min_length": 45.6, "completions/max_length": 312.8, "completions/clipped_ratio": 0.0, "kl": 0.12027762932702898, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.794871794871796, "step": 1380}, {"loss": 0.004513732343912125, "grad_norm": 1.0103995796070642, "learning_rate": 2.3699068620408301e-07, "reward": 2.925, "reward_std": 0.6301033198833466, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.16750000268220902, "rewards/MazeReward/std": 0.2202708065509796, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.6328125, "completions/min_length": 48.6, "completions/max_length": 316.2, "completions/clipped_ratio": 0.0, "kl": 0.11282904949039221, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.837606837606838, "step": 1385}, {"loss": 0.004292643815279007, "grad_norm": 0.9374329600105505, "learning_rate": 2.3348413563600323e-07, "reward": 3.290625, "reward_std": 0.557636970281601, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.20406250059604644, "rewards/MazeReward/std": 0.22283052206039428, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.1765625, "completions/min_length": 46.2, "completions/max_length": 355.6, "completions/clipped_ratio": 0.0, "kl": 0.10730244438163936, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.88034188034188, "step": 1390}, {"loss": 0.004503637924790383, "grad_norm": 0.7025300585187501, "learning_rate": 2.2999580107322654e-07, "reward": 2.625, "reward_std": 0.7034467041492463, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.13750000596046447, "rewards/MazeReward/std": 0.2158157378435135, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.728125, "completions/min_length": 48.6, "completions/max_length": 336.4, "completions/clipped_ratio": 0.0, "kl": 0.11257844744250178, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.923076923076923, "step": 1395}, {"loss": 0.004440478980541229, "grad_norm": 0.9630107428037026, "learning_rate": 2.2652592093878665e-07, "reward": 3.009375, "reward_std": 0.6271803140640259, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.17593750655651091, "rewards/MazeReward/std": 0.2160946547985077, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.29375, "completions/min_length": 48.8, "completions/max_length": 351.2, "completions/clipped_ratio": 0.0, "kl": 0.11099038491956889, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 11.965811965811966, "step": 1400}, {"loss": 0.0042601808905601505, "grad_norm": 1.0502469988141634, "learning_rate": 2.2307473239438152e-07, "reward": 3.21875, "reward_std": 0.5823124885559082, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.196875, "rewards/MazeReward/std": 0.2266264945268631, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.184375, "completions/min_length": 46.8, "completions/max_length": 330.0, "completions/clipped_ratio": 0.0, "kl": 0.10649899104610086, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.008547008547009, "step": 1405}, {"loss": 0.004429550841450691, "grad_norm": 1.4774073020856746, "learning_rate": 2.1964247132416368e-07, "reward": 2.475, "reward_std": 0.4280003309249878, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.12250000089406968, "rewards/MazeReward/std": 0.19068382680416107, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.7, "completions/min_length": 50.8, "completions/max_length": 327.4, "completions/clipped_ratio": 0.0, "kl": 0.1107333465013653, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.051282051282051, "step": 1410}, {"loss": 0.004407884925603867, "grad_norm": 0.8023690241930199, "learning_rate": 2.1622937231861822e-07, "reward": 2.846875, "reward_std": 0.7527072727680206, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.1596875011920929, "rewards/MazeReward/std": 0.21881043016910554, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 114.815625, "completions/min_length": 48.4, "completions/max_length": 391.4, "completions/clipped_ratio": 0.0, "kl": 0.11019802512601018, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.094017094017094, "step": 1415}, {"loss": 0.004655531421303749, "grad_norm": 1.1439206208019748, "learning_rate": 2.128356686585282e-07, "reward": 3.15, "reward_std": 0.727622926235199, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.19000000655651092, "rewards/MazeReward/std": 0.23026613295078277, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 110.5859375, "completions/min_length": 47.0, "completions/max_length": 357.8, "completions/clipped_ratio": 0.0, "kl": 0.1163951527327299, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.136752136752136, "step": 1420}, {"loss": 0.004600329324603081, "grad_norm": 0.07746796563807994, "learning_rate": 2.0946159229903088e-07, "reward": 2.984375, "reward_std": 0.4566905677318573, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.1734375, "rewards/MazeReward/std": 0.21304075717926024, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.9546875, "completions/min_length": 48.4, "completions/max_length": 343.6, "completions/clipped_ratio": 0.0, "kl": 0.11500273984856904, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.179487179487179, "step": 1425}, {"loss": 0.004439861327409744, "grad_norm": 0.9638497800872682, "learning_rate": 2.0610737385376348e-07, "reward": 2.915625, "reward_std": 0.7669902503490448, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.16656250059604644, "rewards/MazeReward/std": 0.21833944022655488, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 113.6953125, "completions/min_length": 44.0, "completions/max_length": 340.6, "completions/clipped_ratio": 0.0, "kl": 0.1109996922314167, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.222222222222221, "step": 1430}, {"loss": 0.0046162322163581845, "grad_norm": 0.9058044499170169, "learning_rate": 2.0277324257910106e-07, "reward": 2.8, "reward_std": 1.0353084444999694, "frac_reward_zero_std": 0.6875, "rewards/MazeReward/mean": 0.15499999821186067, "rewards/MazeReward/std": 0.2393491506576538, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.51875, "completions/min_length": 47.2, "completions/max_length": 350.4, "completions/clipped_ratio": 0.0, "kl": 0.11539947343990206, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.264957264957266, "step": 1435}, {"loss": 0.004491125792264938, "grad_norm": 1.4637020383260568, "learning_rate": 1.9945942635848745e-07, "reward": 2.90625, "reward_std": 0.4800775945186615, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.1656249940395355, "rewards/MazeReward/std": 0.20969883799552919, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.1828125, "completions/min_length": 48.0, "completions/max_length": 315.0, "completions/clipped_ratio": 0.0, "kl": 0.11225917679257691, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.307692307692308, "step": 1440}, {"loss": 0.004119713604450226, "grad_norm": 0.6810633086759158, "learning_rate": 1.9616615168685942e-07, "reward": 3.08125, "reward_std": 0.5347940564155579, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.18312500715255736, "rewards/MazeReward/std": 0.22275058329105377, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.4625, "completions/min_length": 46.8, "completions/max_length": 345.2, "completions/clipped_ratio": 0.0, "kl": 0.10299722058698535, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.350427350427351, "step": 1445}, {"loss": 0.004815234988927841, "grad_norm": 0.7530645981845013, "learning_rate": 1.9289364365516607e-07, "reward": 3.05625, "reward_std": 0.2405204713344574, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.18062500655651093, "rewards/MazeReward/std": 0.19829190224409105, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 116.7078125, "completions/min_length": 45.4, "completions/max_length": 322.2, "completions/clipped_ratio": 0.0, "kl": 0.12037030216306448, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.393162393162394, "step": 1450}, {"loss": 0.004600993543863297, "grad_norm": 0.8855228807329661, "learning_rate": 1.896421259349844e-07, "reward": 2.825, "reward_std": 0.35327176451683046, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.15750000178813933, "rewards/MazeReward/std": 0.19714967012405396, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.1796875, "completions/min_length": 45.6, "completions/max_length": 370.0, "completions/clipped_ratio": 0.0, "kl": 0.11501931981183589, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.435897435897436, "step": 1455}, {"loss": 0.00438457727432251, "grad_norm": 1.0707229873925135, "learning_rate": 1.8641182076323148e-07, "reward": 3.20625, "reward_std": 0.5865340948104858, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.1956250011920929, "rewards/MazeReward/std": 0.24408069550991057, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.2359375, "completions/min_length": 50.6, "completions/max_length": 348.6, "completions/clipped_ratio": 0.0, "kl": 0.10961648882366717, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.478632478632479, "step": 1460}, {"loss": 0.004212798923254013, "grad_norm": 0.8344007462948576, "learning_rate": 1.8320294892697475e-07, "reward": 2.86875, "reward_std": 0.47283042669296266, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.16187499910593034, "rewards/MazeReward/std": 0.20645065903663634, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.0171875, "completions/min_length": 48.0, "completions/max_length": 340.0, "completions/clipped_ratio": 0.0, "kl": 0.10531781297177076, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.521367521367521, "step": 1465}, {"loss": 0.004427079856395721, "grad_norm": 0.7680317788100143, "learning_rate": 1.8001572974834168e-07, "reward": 2.95625, "reward_std": 0.4864831566810608, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.1706250101327896, "rewards/MazeReward/std": 0.2195913314819336, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.6984375, "completions/min_length": 46.6, "completions/max_length": 344.4, "completions/clipped_ratio": 0.0, "kl": 0.11068193083629012, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.564102564102564, "step": 1470}, {"loss": 0.004366424679756164, "grad_norm": 0.8213676769472825, "learning_rate": 1.768503810695295e-07, "reward": 3.153125, "reward_std": 0.48257118463516235, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.19031250178813935, "rewards/MazeReward/std": 0.22344300746917725, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.28125, "completions/min_length": 48.8, "completions/max_length": 341.8, "completions/clipped_ratio": 0.0, "kl": 0.10914504565298558, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.606837606837606, "step": 1475}, {"loss": 0.004094987362623215, "grad_norm": 0.4507625002743985, "learning_rate": 1.7370711923791564e-07, "reward": 3.3, "reward_std": 0.45387778878211976, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.20500000417232514, "rewards/MazeReward/std": 0.23486512005329133, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.70625, "completions/min_length": 47.6, "completions/max_length": 356.8, "completions/clipped_ratio": 0.0, "kl": 0.1023398591671139, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.649572649572649, "step": 1480}, {"loss": 0.004164828360080719, "grad_norm": 0.49743670335935936, "learning_rate": 1.70586159091271e-07, "reward": 3.1796875, "reward_std": 0.21673506498336792, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.1931249976158142, "rewards/MazeReward/std": 0.21075069308280944, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.1671875, "completions/min_length": 49.4, "completions/max_length": 353.8, "completions/clipped_ratio": 0.0, "kl": 0.10411405102349817, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.692307692307692, "step": 1485}, {"loss": 0.0042341239750385284, "grad_norm": 1.310487091991775, "learning_rate": 1.674877139430758e-07, "reward": 3.259375, "reward_std": 0.8526325225830078, "frac_reward_zero_std": 0.725, "rewards/MazeReward/mean": 0.20093750655651094, "rewards/MazeReward/std": 0.23574597835540773, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.65625, "completions/min_length": 49.2, "completions/max_length": 417.4, "completions/clipped_ratio": 0.0, "kl": 0.10584182045422494, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.735042735042736, "step": 1490}, {"loss": 0.0041828140616416935, "grad_norm": 1.1524197214167893, "learning_rate": 1.6441199556794034e-07, "reward": 3.3875, "reward_std": 0.8112789869308472, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.21375000178813935, "rewards/MazeReward/std": 0.24604512751102448, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.7375, "completions/min_length": 49.0, "completions/max_length": 435.8, "completions/clipped_ratio": 0.0, "kl": 0.10455569853074849, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.777777777777779, "step": 1495}, {"loss": 0.004102487117052078, "grad_norm": 1.0393822846213643, "learning_rate": 1.6135921418712955e-07, "reward": 3.134375, "reward_std": 0.595169198513031, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.1884375035762787, "rewards/MazeReward/std": 0.22971762716770172, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.11875, "completions/min_length": 49.6, "completions/max_length": 458.0, "completions/clipped_ratio": 0.0, "kl": 0.10254335454665124, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.820512820512821, "step": 1500}, {"loss": 0.0040058080106973645, "grad_norm": 1.0572866774601455, "learning_rate": 1.5832957845419582e-07, "reward": 3.2625, "reward_std": 0.4272655785083771, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.20125000178813934, "rewards/MazeReward/std": 0.20636436045169831, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.5453125, "completions/min_length": 46.4, "completions/max_length": 359.6, "completions/clipped_ratio": 0.0, "kl": 0.10014389199204743, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.863247863247864, "step": 1505}, {"loss": 0.00405019074678421, "grad_norm": 0.9228517594432902, "learning_rate": 1.553232954407171e-07, "reward": 2.9953125, "reward_std": 0.6427501201629638, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.17468750476837158, "rewards/MazeReward/std": 0.2177804708480835, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 135.6796875, "completions/min_length": 48.0, "completions/max_length": 713.0, "completions/clipped_ratio": 0.0015625, "kl": 0.1012410223018378, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.905982905982906, "step": 1510}, {"loss": 0.004200476035475731, "grad_norm": 0.7058154484867393, "learning_rate": 1.52340570622144e-07, "reward": 3.4466796875, "reward_std": 0.516084223985672, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.21968749761581421, "rewards/MazeReward/std": 0.23806648850440978, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 134.66875, "completions/min_length": 48.0, "completions/max_length": 389.0, "completions/clipped_ratio": 0.0, "kl": 0.10501653309911489, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.948717948717949, "step": 1515}, {"loss": 0.004174098372459412, "grad_norm": 0.44572165390446444, "learning_rate": 1.493816078637557e-07, "reward": 3.203125, "reward_std": 0.4287213921546936, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.19531250298023223, "rewards/MazeReward/std": 0.24020220935344697, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.31875, "completions/min_length": 48.6, "completions/max_length": 415.6, "completions/clipped_ratio": 0.0, "kl": 0.1043540752492845, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 12.991452991452991, "step": 1520}, {"loss": 0.004134422540664673, "grad_norm": 1.1601946120753088, "learning_rate": 1.4644660940672627e-07, "reward": 3.31875, "reward_std": 0.7683353304862977, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.2068750023841858, "rewards/MazeReward/std": 0.25922371447086334, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.5453125, "completions/min_length": 50.6, "completions/max_length": 430.4, "completions/clipped_ratio": 0.0, "kl": 0.10335386814549566, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.034188034188034, "step": 1525}, {"loss": 0.004078666120767594, "grad_norm": 0.9508085146281441, "learning_rate": 1.435357758543015e-07, "reward": 3.553125, "reward_std": 0.7546874701976776, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.23031250536441802, "rewards/MazeReward/std": 0.2488747239112854, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.1671875, "completions/min_length": 47.6, "completions/max_length": 354.8, "completions/clipped_ratio": 0.0, "kl": 0.10196628724224865, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.076923076923077, "step": 1530}, {"loss": 0.003982898965477944, "grad_norm": 0.9270066606620684, "learning_rate": 1.4064930615808806e-07, "reward": 3.271875, "reward_std": 0.40897447764873507, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.20218750536441804, "rewards/MazeReward/std": 0.2285274773836136, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 132.328125, "completions/min_length": 51.2, "completions/max_length": 342.6, "completions/clipped_ratio": 0.0, "kl": 0.09956136830151081, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.11965811965812, "step": 1535}, {"loss": 0.003968718647956848, "grad_norm": 0.612416092717352, "learning_rate": 1.3778739760445552e-07, "reward": 2.96875, "reward_std": 0.7903600454330444, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.171875, "rewards/MazeReward/std": 0.23524323403835296, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 135.9625, "completions/min_length": 50.6, "completions/max_length": 441.8, "completions/clipped_ratio": 0.0, "kl": 0.09921041326597332, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.162393162393162, "step": 1540}, {"loss": 0.004155702888965607, "grad_norm": 1.0284612331588838, "learning_rate": 1.349502458010519e-07, "reward": 3.121875, "reward_std": 0.496600079536438, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.18718750178813934, "rewards/MazeReward/std": 0.24737907946109772, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 137.615625, "completions/min_length": 50.0, "completions/max_length": 379.4, "completions/clipped_ratio": 0.0, "kl": 0.10387767017818987, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.205128205128204, "step": 1545}, {"loss": 0.004071044176816941, "grad_norm": 1.1606217573456257, "learning_rate": 1.321380446634342e-07, "reward": 3.0576171875, "reward_std": 0.622650396823883, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.18093750774860382, "rewards/MazeReward/std": 0.23252680003643036, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 135.1421875, "completions/min_length": 48.0, "completions/max_length": 413.4, "completions/clipped_ratio": 0.0, "kl": 0.10176367992535233, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.247863247863247, "step": 1550}, {"loss": 0.0042334794998168945, "grad_norm": 0.05934888405961084, "learning_rate": 1.2935098640181457e-07, "reward": 3.5515625, "reward_std": 0.564618581533432, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.23031250536441802, "rewards/MazeReward/std": 0.23733251690864562, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.353125, "completions/min_length": 52.4, "completions/max_length": 402.6, "completions/clipped_ratio": 0.0, "kl": 0.10582958087325096, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.290598290598291, "step": 1555}, {"loss": 0.00375819131731987, "grad_norm": 0.8967278127436061, "learning_rate": 1.2658926150792322e-07, "reward": 3.31875, "reward_std": 0.811944055557251, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.20687500536441802, "rewards/MazeReward/std": 0.2688955247402191, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 140.5203125, "completions/min_length": 54.2, "completions/max_length": 381.8, "completions/clipped_ratio": 0.0, "kl": 0.09393652775324882, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.333333333333334, "step": 1560}, {"loss": 0.003957881778478623, "grad_norm": 0.6689887124628756, "learning_rate": 1.2385305874198775e-07, "reward": 3.55, "reward_std": 0.3778967708349228, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.23000000715255736, "rewards/MazeReward/std": 0.22691603899002075, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 143.8890625, "completions/min_length": 51.4, "completions/max_length": 452.0, "completions/clipped_ratio": 0.0, "kl": 0.09893502001650631, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.376068376068377, "step": 1565}, {"loss": 0.003567250818014145, "grad_norm": 0.8305684758545985, "learning_rate": 1.2114256511983274e-07, "reward": 3.24375, "reward_std": 0.5553144633769989, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.19937500953674317, "rewards/MazeReward/std": 0.237774994969368, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 136.734375, "completions/min_length": 49.6, "completions/max_length": 350.2, "completions/clipped_ratio": 0.0, "kl": 0.08917890437878669, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.418803418803419, "step": 1570}, {"loss": 0.0041314858943223955, "grad_norm": 0.6341620757128188, "learning_rate": 1.1845796590009683e-07, "reward": 3.425, "reward_std": 0.6744720757007598, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.21750000417232512, "rewards/MazeReward/std": 0.25507332682609557, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.6171875, "completions/min_length": 55.0, "completions/max_length": 365.2, "completions/clipped_ratio": 0.0, "kl": 0.1032931875437498, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.461538461538462, "step": 1575}, {"loss": 0.0038429252803325654, "grad_norm": 0.4614397177704931, "learning_rate": 1.1579944457157059e-07, "reward": 3.584375, "reward_std": 0.528683266043663, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.2334374964237213, "rewards/MazeReward/std": 0.25245185792446134, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 133.471875, "completions/min_length": 54.2, "completions/max_length": 343.4, "completions/clipped_ratio": 0.0, "kl": 0.09606116027571261, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.504273504273504, "step": 1580}, {"loss": 0.0036599829792976378, "grad_norm": 1.0238721759018097, "learning_rate": 1.1316718284065535e-07, "reward": 3.384375, "reward_std": 0.6477352797985076, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.21375000923871995, "rewards/MazeReward/std": 0.21816131472587585, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.02490137964487076, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 144.0140625, "completions/min_length": 50.0, "completions/max_length": 727.8, "completions/clipped_ratio": 0.0015625, "kl": 0.09148284844122827, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.547008547008547, "step": 1585}, {"loss": 0.0038771606981754304, "grad_norm": 0.9521251840405882, "learning_rate": 1.1056136061894384e-07, "reward": 3.640625, "reward_std": 0.9544131755828857, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.23906249701976776, "rewards/MazeReward/std": 0.26780156791210175, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 138.340625, "completions/min_length": 55.0, "completions/max_length": 391.0, "completions/clipped_ratio": 0.0, "kl": 0.09690871224738658, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.58974358974359, "step": 1590}, {"loss": 0.0037319328635931014, "grad_norm": 0.04557115783910534, "learning_rate": 1.0798215601092353e-07, "reward": 3.0359375, "reward_std": 0.41956892013549807, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.17875000238418579, "rewards/MazeReward/std": 0.21701315343379973, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 141.0671875, "completions/min_length": 50.2, "completions/max_length": 684.0, "completions/clipped_ratio": 0.0015625, "kl": 0.09328375519253314, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.632478632478632, "step": 1595}, {"loss": 0.0041183046996593475, "grad_norm": 1.2025631515659523, "learning_rate": 1.0542974530180327e-07, "reward": 3.2875, "reward_std": 0.6173423409461976, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.20375000387430192, "rewards/MazeReward/std": 0.22543151676654816, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 131.0828125, "completions/min_length": 49.4, "completions/max_length": 374.2, "completions/clipped_ratio": 0.0, "kl": 0.10294593628495932, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.675213675213675, "step": 1600}, {"loss": 0.004156334698200226, "grad_norm": 0.7796861515777691, "learning_rate": 1.0290430294546448e-07, "reward": 3.3857421875, "reward_std": 0.7448040068149566, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.21375000476837158, "rewards/MazeReward/std": 0.2573282241821289, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 135.4375, "completions/min_length": 46.4, "completions/max_length": 698.4, "completions/clipped_ratio": 0.0015625, "kl": 0.10390158286318182, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.717948717948717, "step": 1605}, {"loss": 0.004261004179716111, "grad_norm": 0.98910838336713, "learning_rate": 1.0040600155253764e-07, "reward": 3.246875, "reward_std": 0.6785987883806228, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.19968749582767487, "rewards/MazeReward/std": 0.2529229700565338, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 130.2, "completions/min_length": 50.4, "completions/max_length": 362.8, "completions/clipped_ratio": 0.0, "kl": 0.10653561083599924, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.760683760683762, "step": 1610}, {"loss": 0.004665160179138183, "grad_norm": 0.6280736093035446, "learning_rate": 9.793501187860431e-08, "reward": 3.175, "reward_std": 0.45869363844394684, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.19250000417232513, "rewards/MazeReward/std": 0.2327058345079422, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.7015625, "completions/min_length": 46.2, "completions/max_length": 367.0, "completions/clipped_ratio": 0.0, "kl": 0.11661676904186606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.803418803418804, "step": 1615}, {"loss": 0.004146835952997208, "grad_norm": 1.1229957421935337, "learning_rate": 9.549150281252632e-08, "reward": 3.365625, "reward_std": 0.585090035200119, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.21156250238418578, "rewards/MazeReward/std": 0.24423626065254211, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.015625, "completions/min_length": 47.8, "completions/max_length": 389.8, "completions/clipped_ratio": 0.0, "kl": 0.10365095781162381, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.846153846153847, "step": 1620}, {"loss": 0.004512753337621689, "grad_norm": 1.3051344498253252, "learning_rate": 9.307564136490254e-08, "reward": 3.371875, "reward_std": 0.6207172274589539, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.21218750774860382, "rewards/MazeReward/std": 0.24975141286849975, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 129.1359375, "completions/min_length": 49.0, "completions/max_length": 431.4, "completions/clipped_ratio": 0.0, "kl": 0.11281033270061017, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.88888888888889, "step": 1625}, {"loss": 0.004270961135625839, "grad_norm": 0.40871228210294197, "learning_rate": 9.068759265665382e-08, "reward": 2.965625, "reward_std": 0.3837372213602066, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.1715625047683716, "rewards/MazeReward/std": 0.21898579895496367, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.565625, "completions/min_length": 47.0, "completions/max_length": 370.0, "completions/clipped_ratio": 0.0, "kl": 0.10675760302692652, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.931623931623932, "step": 1630}, {"loss": 0.004375287890434265, "grad_norm": 0.9853878316290574, "learning_rate": 8.832751990773712e-08, "reward": 2.846875, "reward_std": 0.34098189175128935, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.15968750268220902, "rewards/MazeReward/std": 0.19211819767951965, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.6375, "completions/min_length": 48.8, "completions/max_length": 353.8, "completions/clipped_ratio": 0.0, "kl": 0.10937570687383413, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 13.974358974358974, "step": 1635}, {"loss": 0.004281745105981827, "grad_norm": 0.6220137988989953, "learning_rate": 8.599558442598998e-08, "reward": 3.50625, "reward_std": 0.5477135837078094, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.22562500238418579, "rewards/MazeReward/std": 0.2416780710220337, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.046875, "completions/min_length": 48.0, "completions/max_length": 426.0, "completions/clipped_ratio": 0.0, "kl": 0.10704006981104612, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.017094017094017, "step": 1640}, {"loss": 0.004411678016185761, "grad_norm": 1.1807223503571544, "learning_rate": 8.369194559610481e-08, "reward": 3.28125, "reward_std": 0.5337654531002045, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.20312500596046448, "rewards/MazeReward/std": 0.22464993894100188, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.753125, "completions/min_length": 49.6, "completions/max_length": 339.4, "completions/clipped_ratio": 0.0, "kl": 0.11027584793046116, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.05982905982906, "step": 1645}, {"loss": 0.004653770476579666, "grad_norm": 0.6645196225456695, "learning_rate": 8.141676086873573e-08, "reward": 3.659375, "reward_std": 0.4128348171710968, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.2409375011920929, "rewards/MazeReward/std": 0.26526184678077697, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.665625, "completions/min_length": 51.2, "completions/max_length": 317.2, "completions/clipped_ratio": 0.0, "kl": 0.11633407985791563, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.102564102564102, "step": 1650}, {"loss": 0.004602273926138878, "grad_norm": 0.05438863761076759, "learning_rate": 7.917018574973644e-08, "reward": 3.1125, "reward_std": 0.1789622038602829, "frac_reward_zero_std": 0.9375, "rewards/MazeReward/mean": 0.1862500011920929, "rewards/MazeReward/std": 0.21130214631557465, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.334375, "completions/min_length": 47.4, "completions/max_length": 333.6, "completions/clipped_ratio": 0.0, "kl": 0.11503907395526766, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.145299145299145, "step": 1655}, {"loss": 0.004293971136212349, "grad_norm": 0.43688586874881835, "learning_rate": 7.695237378953224e-08, "reward": 3.446875, "reward_std": 0.6536878108978271, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.21968749761581421, "rewards/MazeReward/std": 0.2504511445760727, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.6921875, "completions/min_length": 46.2, "completions/max_length": 335.0, "completions/clipped_ratio": 0.0, "kl": 0.10734036625362933, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.188034188034187, "step": 1660}, {"loss": 0.004189522564411163, "grad_norm": 0.844809871553244, "learning_rate": 7.476347657262455e-08, "reward": 2.959375, "reward_std": 0.48555052280426025, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.17093749940395356, "rewards/MazeReward/std": 0.20453031957149506, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.55625, "completions/min_length": 47.2, "completions/max_length": 381.4, "completions/clipped_ratio": 0.0, "kl": 0.10472417911514639, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.23076923076923, "step": 1665}, {"loss": 0.0039858706295490265, "grad_norm": 0.522968792512682, "learning_rate": 7.260364370723043e-08, "reward": 3.35, "reward_std": 0.4939745903015137, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.21000000536441804, "rewards/MazeReward/std": 0.23409392535686493, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.9421875, "completions/min_length": 48.6, "completions/max_length": 360.0, "completions/clipped_ratio": 0.0, "kl": 0.09963853806257247, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.273504273504274, "step": 1670}, {"loss": 0.004102340340614319, "grad_norm": 1.0720480282615144, "learning_rate": 7.047302281505735e-08, "reward": 3.6546875, "reward_std": 0.7884458363056183, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.2406250059604645, "rewards/MazeReward/std": 0.2581137716770172, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 134.346875, "completions/min_length": 47.8, "completions/max_length": 700.4, "completions/clipped_ratio": 0.0015625, "kl": 0.10253564361482859, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.316239316239317, "step": 1675}, {"loss": 0.004236004501581192, "grad_norm": 0.5287232895639771, "learning_rate": 6.837175952121304e-08, "reward": 3.190625, "reward_std": 0.5882582247257233, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.19406250417232512, "rewards/MazeReward/std": 0.2406783401966095, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.48125, "completions/min_length": 48.6, "completions/max_length": 371.2, "completions/clipped_ratio": 0.0, "kl": 0.10590210733935237, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.35897435897436, "step": 1680}, {"loss": 0.004362818598747253, "grad_norm": 0.8930005912084705, "learning_rate": 6.629999744425235e-08, "reward": 3.359375, "reward_std": 0.7467323809862136, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.2109375, "rewards/MazeReward/std": 0.25135611593723295, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.9703125, "completions/min_length": 49.8, "completions/max_length": 440.4, "completions/clipped_ratio": 0.0, "kl": 0.10905647352337837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.401709401709402, "step": 1685}, {"loss": 0.004146735370159149, "grad_norm": 0.8842969125336764, "learning_rate": 6.42578781863613e-08, "reward": 3.1625, "reward_std": 0.5814606785774231, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.19125000834465028, "rewards/MazeReward/std": 0.23819169998168946, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.4796875, "completions/min_length": 48.0, "completions/max_length": 401.0, "completions/clipped_ratio": 0.0, "kl": 0.10364811704494059, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.444444444444445, "step": 1690}, {"loss": 0.004596540331840515, "grad_norm": 0.7308075090785238, "learning_rate": 6.22455413236786e-08, "reward": 3.821875, "reward_std": 0.9392133474349975, "frac_reward_zero_std": 0.75, "rewards/MazeReward/mean": 0.2571874916553497, "rewards/MazeReward/std": 0.28829089999198915, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.4609375, "completions/min_length": 52.4, "completions/max_length": 376.6, "completions/clipped_ratio": 0.0, "kl": 0.11490621510893106, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.487179487179487, "step": 1695}, {"loss": 0.003949685022234917, "grad_norm": 0.5203656707355103, "learning_rate": 6.026312439675551e-08, "reward": 3.175, "reward_std": 0.47585675716400144, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.19250000417232513, "rewards/MazeReward/std": 0.2218406856060028, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.490625, "completions/min_length": 46.8, "completions/max_length": 327.2, "completions/clipped_ratio": 0.0, "kl": 0.09874770562164485, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.52991452991453, "step": 1700}, {"loss": 0.004355831816792488, "grad_norm": 0.6697032216645531, "learning_rate": 5.831076290115572e-08, "reward": 3.5125, "reward_std": 0.41589419543743134, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.2262500047683716, "rewards/MazeReward/std": 0.24999509155750274, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.51875, "completions/min_length": 46.4, "completions/max_length": 342.8, "completions/clipped_ratio": 0.0, "kl": 0.10887362537905573, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.572649572649572, "step": 1705}, {"loss": 0.004422678798437119, "grad_norm": 0.7010706958799113, "learning_rate": 5.638859027819409e-08, "reward": 3.278125, "reward_std": 0.3859258651733398, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.20281250178813934, "rewards/MazeReward/std": 0.23284580707550048, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.8578125, "completions/min_length": 48.0, "completions/max_length": 348.2, "completions/clipped_ratio": 0.0, "kl": 0.11055122390389442, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.615384615384615, "step": 1710}, {"loss": 0.004326200857758522, "grad_norm": 0.6684899129688294, "learning_rate": 5.44967379058161e-08, "reward": 3.196875, "reward_std": 0.6069713115692139, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.19468750357627868, "rewards/MazeReward/std": 0.23656201660633086, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.209375, "completions/min_length": 46.4, "completions/max_length": 361.2, "completions/clipped_ratio": 0.0, "kl": 0.10815434050746262, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.658119658119658, "step": 1715}, {"loss": 0.004322724044322967, "grad_norm": 0.7944424841316382, "learning_rate": 5.263533508961826e-08, "reward": 3.403125, "reward_std": 0.8790005326271058, "frac_reward_zero_std": 0.7375, "rewards/MazeReward/mean": 0.21531250178813935, "rewards/MazeReward/std": 0.27138482630252836, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.9078125, "completions/min_length": 48.6, "completions/max_length": 348.0, "completions/clipped_ratio": 0.0, "kl": 0.10805548503994941, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.7008547008547, "step": 1720}, {"loss": 0.0043143235146999356, "grad_norm": 0.8311964411594716, "learning_rate": 5.080450905401057e-08, "reward": 3.321875, "reward_std": 0.47051496505737306, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.20718750655651091, "rewards/MazeReward/std": 0.2368873655796051, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 125.675, "completions/min_length": 47.0, "completions/max_length": 353.2, "completions/clipped_ratio": 0.0, "kl": 0.10785716827958822, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.743589743589745, "step": 1725}, {"loss": 0.004096883535385132, "grad_norm": 1.2412365855858876, "learning_rate": 4.9004384933520547e-08, "reward": 3.59375, "reward_std": 0.7541570663452148, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.23437500298023223, "rewards/MazeReward/std": 0.2849434196949005, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.0015625, "completions/min_length": 48.2, "completions/max_length": 362.4, "completions/clipped_ratio": 0.0, "kl": 0.10242009852081538, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.786324786324787, "step": 1730}, {"loss": 0.004359452426433564, "grad_norm": 1.1105087434794185, "learning_rate": 4.723508576424062e-08, "reward": 3.5375, "reward_std": 0.7639246016740799, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.22874999940395355, "rewards/MazeReward/std": 0.24069324731826783, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 117.9265625, "completions/min_length": 48.4, "completions/max_length": 347.4, "completions/clipped_ratio": 0.0, "kl": 0.1089911924675107, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.82905982905983, "step": 1735}, {"loss": 0.004392564296722412, "grad_norm": 0.7388592958462435, "learning_rate": 4.549673247541874e-08, "reward": 3.678125, "reward_std": 0.7469538986682892, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.24281249642372132, "rewards/MazeReward/std": 0.2900205373764038, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.603125, "completions/min_length": 50.2, "completions/max_length": 386.2, "completions/clipped_ratio": 0.0, "kl": 0.10980640817433596, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.871794871794872, "step": 1740}, {"loss": 0.004376126080751419, "grad_norm": 0.43934120657856396, "learning_rate": 4.37894438811931e-08, "reward": 3.409375, "reward_std": 0.2898747891187668, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.21593750417232513, "rewards/MazeReward/std": 0.241091787815094, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.2125, "completions/min_length": 46.8, "completions/max_length": 347.4, "completions/clipped_ratio": 0.0, "kl": 0.10940547166392207, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.914529914529915, "step": 1745}, {"loss": 0.004501381888985634, "grad_norm": 1.187885559295768, "learning_rate": 4.2113336672471245e-08, "reward": 3.640625, "reward_std": 0.8781503081321717, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.23906251192092895, "rewards/MazeReward/std": 0.2923721134662628, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.225, "completions/min_length": 48.0, "completions/max_length": 315.2, "completions/clipped_ratio": 0.0, "kl": 0.11253669201396406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 14.957264957264957, "step": 1750}, {"loss": 0.00429476797580719, "grad_norm": 0.8092896972547012, "learning_rate": 4.0468525408954456e-08, "reward": 3.684375, "reward_std": 0.7190027356147766, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.2434375047683716, "rewards/MazeReward/std": 0.2688789367675781, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.45625, "completions/min_length": 44.2, "completions/max_length": 348.2, "completions/clipped_ratio": 0.0, "kl": 0.10736672207713127, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.0, "step": 1755}, {"loss": 0.004502619802951813, "grad_norm": 0.5830990891113115, "learning_rate": 3.8855122511307626e-08, "reward": 3.46875, "reward_std": 0.33688003718853, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.22187500596046447, "rewards/MazeReward/std": 0.24869490265846253, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.8734375, "completions/min_length": 44.6, "completions/max_length": 399.0, "completions/clipped_ratio": 0.0, "kl": 0.11255799029022455, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.042735042735043, "step": 1760}, {"loss": 0.004365786164999008, "grad_norm": 0.4328270413121731, "learning_rate": 3.727323825347578e-08, "reward": 4.0669921875, "reward_std": 0.756450217962265, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.28187501430511475, "rewards/MazeReward/std": 0.27423061430454254, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 128.025, "completions/min_length": 45.0, "completions/max_length": 705.2, "completions/clipped_ratio": 0.0015625, "kl": 0.1091361996717751, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.085470085470085, "step": 1765}, {"loss": 0.004346990585327148, "grad_norm": 0.81299525416062, "learning_rate": 3.572298075514652e-08, "reward": 3.284375, "reward_std": 0.6671978831291199, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.20343750417232515, "rewards/MazeReward/std": 0.24995078146457672, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.975, "completions/min_length": 47.4, "completions/max_length": 363.6, "completions/clipped_ratio": 0.0, "kl": 0.10865907510742545, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.128205128205128, "step": 1770}, {"loss": 0.004198139905929566, "grad_norm": 1.1087066393589085, "learning_rate": 3.420445597436056e-08, "reward": 3.5125, "reward_std": 0.4084074795246124, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.22625000774860382, "rewards/MazeReward/std": 0.2444426268339157, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.7359375, "completions/min_length": 50.6, "completions/max_length": 339.4, "completions/clipped_ratio": 0.0, "kl": 0.1049537037499249, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.17094017094017, "step": 1775}, {"loss": 0.004243284463882446, "grad_norm": 0.7614062817623753, "learning_rate": 3.271776770026963e-08, "reward": 3.178125, "reward_std": 0.4721804141998291, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.19281249791383742, "rewards/MazeReward/std": 0.23438346683979033, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.8203125, "completions/min_length": 47.2, "completions/max_length": 351.2, "completions/clipped_ratio": 0.0, "kl": 0.10607459908351302, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.213675213675213, "step": 1780}, {"loss": 0.004262470826506615, "grad_norm": 0.6127489399171744, "learning_rate": 3.1263017546042326e-08, "reward": 3.496875, "reward_std": 0.36138366907835007, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.22468750476837157, "rewards/MazeReward/std": 0.23243287205696106, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.6671875, "completions/min_length": 49.4, "completions/max_length": 389.2, "completions/clipped_ratio": 0.0, "kl": 0.10655979104340077, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.256410256410255, "step": 1785}, {"loss": 0.004617030173540116, "grad_norm": 0.967510995638688, "learning_rate": 2.9840304941919416e-08, "reward": 3.2, "reward_std": 0.36295809745788576, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.1949999988079071, "rewards/MazeReward/std": 0.2410827934741974, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.134375, "completions/min_length": 50.0, "completions/max_length": 376.4, "completions/clipped_ratio": 0.0, "kl": 0.115428361389786, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.2991452991453, "step": 1790}, {"loss": 0.004285677149891853, "grad_norm": 0.5796572302503418, "learning_rate": 2.8449727128417367e-08, "reward": 3.521875, "reward_std": 0.4946269363164902, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.22718750834465026, "rewards/MazeReward/std": 0.26889726221561433, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.8078125, "completions/min_length": 46.2, "completions/max_length": 391.2, "completions/clipped_ratio": 0.0, "kl": 0.10713190361857414, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.341880341880342, "step": 1795}, {"loss": 0.0039296291768550875, "grad_norm": 0.05075725166593872, "learning_rate": 2.7091379149682682e-08, "reward": 3.671875, "reward_std": 0.5362778604030609, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.2421875089406967, "rewards/MazeReward/std": 0.2428381234407425, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.8359375, "completions/min_length": 49.2, "completions/max_length": 319.8, "completions/clipped_ratio": 0.0, "kl": 0.0982246644794941, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.384615384615385, "step": 1800}, {"loss": 0.004575078934431076, "grad_norm": 0.7072293760700058, "learning_rate": 2.5765353846995297e-08, "reward": 3.53125, "reward_std": 0.45395667403936385, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.22812499701976777, "rewards/MazeReward/std": 0.2579535335302353, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.79375, "completions/min_length": 45.6, "completions/max_length": 334.6, "completions/clipped_ratio": 0.0, "kl": 0.11436028825119138, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.427350427350428, "step": 1805}, {"loss": 0.0047042079269886015, "grad_norm": 1.1385185290789412, "learning_rate": 2.4471741852423233e-08, "reward": 4.0, "reward_std": 0.6519853830337524, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.2750000089406967, "rewards/MazeReward/std": 0.28275521993637087, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.3171875, "completions/min_length": 51.8, "completions/max_length": 322.8, "completions/clipped_ratio": 0.0, "kl": 0.11759204547852278, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.47008547008547, "step": 1810}, {"loss": 0.004226198792457581, "grad_norm": 0.9564171205522918, "learning_rate": 2.3210631582627927e-08, "reward": 3.725, "reward_std": 0.5720295608043671, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.24750000089406968, "rewards/MazeReward/std": 0.2586837708950043, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.0671875, "completions/min_length": 47.2, "completions/max_length": 311.2, "completions/clipped_ratio": 0.0, "kl": 0.10564614557661116, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.512820512820513, "step": 1815}, {"loss": 0.004193376749753952, "grad_norm": 1.2406529941181959, "learning_rate": 2.1982109232821176e-08, "reward": 3.56875, "reward_std": 0.5979138255119324, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.2318750023841858, "rewards/MazeReward/std": 0.26024834215641024, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.2984375, "completions/min_length": 44.0, "completions/max_length": 330.2, "completions/clipped_ratio": 0.0, "kl": 0.10482329577207565, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.555555555555555, "step": 1820}, {"loss": 0.004319808259606361, "grad_norm": 0.7017519406514044, "learning_rate": 2.0786258770873645e-08, "reward": 3.378125, "reward_std": 0.43282521367073057, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.21281250417232514, "rewards/MazeReward/std": 0.2369197577238083, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.415625, "completions/min_length": 44.6, "completions/max_length": 315.0, "completions/clipped_ratio": 0.0, "kl": 0.1080109877511859, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.598290598290598, "step": 1825}, {"loss": 0.004267299920320511, "grad_norm": 0.6632277424765783, "learning_rate": 1.9623161931575926e-08, "reward": 3.559375, "reward_std": 0.5012161135673523, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.23093750476837158, "rewards/MazeReward/std": 0.27313116788864134, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.8125, "completions/min_length": 49.2, "completions/max_length": 344.6, "completions/clipped_ratio": 0.0, "kl": 0.10668526445515454, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.64102564102564, "step": 1830}, {"loss": 0.004633168503642082, "grad_norm": 0.5135442974101795, "learning_rate": 1.849289821105199e-08, "reward": 3.34375, "reward_std": 0.4953270524740219, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.20937500894069672, "rewards/MazeReward/std": 0.255164110660553, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.003125, "completions/min_length": 47.6, "completions/max_length": 348.8, "completions/clipped_ratio": 0.0, "kl": 0.11582731227390468, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.683760683760683, "step": 1835}, {"loss": 0.004230240359902382, "grad_norm": 1.0016497259218642, "learning_rate": 1.7395544861325718e-08, "reward": 3.0966796875, "reward_std": 0.304620361328125, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.18468749821186065, "rewards/MazeReward/std": 0.21719041466712952, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 119.6359375, "completions/min_length": 47.0, "completions/max_length": 300.4, "completions/clipped_ratio": 0.0, "kl": 0.10574633032083511, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.726495726495726, "step": 1840}, {"loss": 0.004225829616189003, "grad_norm": 0.5174454406155103, "learning_rate": 1.6331176885040876e-08, "reward": 3.515625, "reward_std": 0.6882634222507477, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.22656250298023223, "rewards/MazeReward/std": 0.2732086151838303, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 128.990625, "completions/min_length": 47.6, "completions/max_length": 424.6, "completions/clipped_ratio": 0.0, "kl": 0.10564749445766211, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.76923076923077, "step": 1845}, {"loss": 0.004012863337993622, "grad_norm": 0.8255536120984532, "learning_rate": 1.5299867030334813e-08, "reward": 3.371875, "reward_std": 0.7521422803401947, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.2121875047683716, "rewards/MazeReward/std": 0.2528801321983337, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.2875, "completions/min_length": 48.2, "completions/max_length": 341.8, "completions/clipped_ratio": 0.0, "kl": 0.10030470197089017, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.811965811965813, "step": 1850}, {"loss": 0.004221369326114654, "grad_norm": 0.812809914447486, "learning_rate": 1.4301685785866213e-08, "reward": 3.259375, "reward_std": 0.31027054488658906, "frac_reward_zero_std": 0.9125, "rewards/MazeReward/mean": 0.2009375125169754, "rewards/MazeReward/std": 0.2311701625585556, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.8, "completions/min_length": 50.2, "completions/max_length": 343.4, "completions/clipped_ratio": 0.0, "kl": 0.10551224481314421, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.854700854700855, "step": 1855}, {"loss": 0.004395525902509689, "grad_norm": 1.1091420453814027, "learning_rate": 1.3336701375997127e-08, "reward": 4.075, "reward_std": 0.670307207107544, "frac_reward_zero_std": 0.8125, "rewards/MazeReward/mean": 0.2824999988079071, "rewards/MazeReward/std": 0.27602744698524473, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.9515625, "completions/min_length": 47.0, "completions/max_length": 344.8, "completions/clipped_ratio": 0.0, "kl": 0.10986831542104483, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.897435897435898, "step": 1860}, {"loss": 0.004101991280913353, "grad_norm": 0.8145535007785947, "learning_rate": 1.240497975613014e-08, "reward": 3.7375, "reward_std": 0.5223577737808227, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.2487500011920929, "rewards/MazeReward/std": 0.2582669973373413, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.453125, "completions/min_length": 49.8, "completions/max_length": 424.2, "completions/clipped_ratio": 0.0, "kl": 0.10255064629018307, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.94017094017094, "step": 1865}, {"loss": 0.004401101171970368, "grad_norm": 0.06424905878178337, "learning_rate": 1.1506584608200364e-08, "reward": 3.38125, "reward_std": 0.5968389272689819, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.21312500834465026, "rewards/MazeReward/std": 0.2849972754716873, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.9578125, "completions/min_length": 46.6, "completions/max_length": 382.6, "completions/clipped_ratio": 0.0, "kl": 0.1100274601019919, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 15.982905982905983, "step": 1870}, {"loss": 0.004095544293522835, "grad_norm": 0.6487314480801746, "learning_rate": 1.0641577336322761e-08, "reward": 3.975, "reward_std": 0.8413453936576843, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.2725000143051147, "rewards/MazeReward/std": 0.3040153205394745, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 127.4640625, "completions/min_length": 49.2, "completions/max_length": 316.6, "completions/clipped_ratio": 0.0, "kl": 0.10238274387083948, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.025641025641026, "step": 1875}, {"loss": 0.004243911057710647, "grad_norm": 0.49436098483755675, "learning_rate": 9.810017062595321e-09, "reward": 3.334375, "reward_std": 0.4042118787765503, "frac_reward_zero_std": 0.875, "rewards/MazeReward/mean": 0.2084375023841858, "rewards/MazeReward/std": 0.2501231372356415, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.6984375, "completions/min_length": 47.6, "completions/max_length": 422.6, "completions/clipped_ratio": 0.0, "kl": 0.10609784824773669, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.068376068376068, "step": 1880}, {"loss": 0.004356931149959564, "grad_norm": 0.508436919192622, "learning_rate": 9.011960623058201e-09, "reward": 3.59375, "reward_std": 0.6314712464809418, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.23437499403953552, "rewards/MazeReward/std": 0.27095602750778197, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.1859375, "completions/min_length": 50.0, "completions/max_length": 351.0, "completions/clipped_ratio": 0.0, "kl": 0.10891414480283856, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.11111111111111, "step": 1885}, {"loss": 0.004078886285424232, "grad_norm": 0.6664563867287676, "learning_rate": 8.247462563808816e-09, "reward": 3.4625, "reward_std": 0.47230331152677535, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.22125000655651092, "rewards/MazeReward/std": 0.2445346087217331, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.0859375, "completions/min_length": 47.0, "completions/max_length": 387.4, "completions/clipped_ratio": 0.0, "kl": 0.10197192868217826, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.153846153846153, "step": 1890}, {"loss": 0.004487024992704392, "grad_norm": 0.0535281517184304, "learning_rate": 7.516575137274162e-09, "reward": 3.478125, "reward_std": 0.344576370716095, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.2228124976158142, "rewards/MazeReward/std": 0.24724717438220978, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.35, "completions/min_length": 49.4, "completions/max_length": 334.4, "completions/clipped_ratio": 0.0, "kl": 0.11215386167168617, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.196581196581196, "step": 1895}, {"loss": 0.0046449493616819385, "grad_norm": 0.7746570584609815, "learning_rate": 6.819348298638839e-09, "reward": 3.440625, "reward_std": 0.7381389677524567, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.2190625011920929, "rewards/MazeReward/std": 0.282903778553009, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.7515625, "completions/min_length": 49.8, "completions/max_length": 321.6, "completions/clipped_ratio": 0.0, "kl": 0.11612661136314273, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.23931623931624, "step": 1900}, {"loss": 0.004342161864042282, "grad_norm": 0.6463953386999981, "learning_rate": 6.15582970243117e-09, "reward": 3.728125, "reward_std": 0.355919748544693, "frac_reward_zero_std": 0.9, "rewards/MazeReward/mean": 0.24781250953674316, "rewards/MazeReward/std": 0.2368593394756317, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.953125, "completions/min_length": 46.0, "completions/max_length": 318.4, "completions/clipped_ratio": 0.0, "kl": 0.10854203356429934, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.28205128205128, "step": 1905}, {"loss": 0.004704833775758743, "grad_norm": 1.1793486433508549, "learning_rate": 5.526064699265753e-09, "reward": 3.4875, "reward_std": 0.7744836688041687, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.22375000417232513, "rewards/MazeReward/std": 0.27049732208251953, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.671875, "completions/min_length": 48.6, "completions/max_length": 375.2, "completions/clipped_ratio": 0.0, "kl": 0.11762102926149964, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.324786324786324, "step": 1910}, {"loss": 0.004716131463646888, "grad_norm": 0.6733016986298439, "learning_rate": 4.9300963327441044e-09, "reward": 3.565625, "reward_std": 0.5362739384174346, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.23156249821186065, "rewards/MazeReward/std": 0.2886486887931824, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.8703125, "completions/min_length": 44.2, "completions/max_length": 383.0, "completions/clipped_ratio": 0.0, "kl": 0.11788390032015741, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.367521367521366, "step": 1915}, {"loss": 0.004523798450827598, "grad_norm": 1.0382370758710289, "learning_rate": 4.367965336512403e-09, "reward": 3.396875, "reward_std": 0.803810977935791, "frac_reward_zero_std": 0.775, "rewards/MazeReward/mean": 0.21468749940395354, "rewards/MazeReward/std": 0.2716837853193283, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.578125, "completions/min_length": 52.8, "completions/max_length": 324.8, "completions/clipped_ratio": 0.0, "kl": 0.11309504546225072, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.41025641025641, "step": 1920}, {"loss": 0.004392998665571213, "grad_norm": 0.8257139308848667, "learning_rate": 3.8397101314774915e-09, "reward": 3.95625, "reward_std": 0.5920416593551636, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.27062500417232516, "rewards/MazeReward/std": 0.28637204468250277, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.0703125, "completions/min_length": 45.2, "completions/max_length": 327.6, "completions/clipped_ratio": 0.0, "kl": 0.10980163249187172, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.45299145299145, "step": 1925}, {"loss": 0.004151855036616326, "grad_norm": 1.0742469342385899, "learning_rate": 3.3453668231809283e-09, "reward": 3.6044921875, "reward_std": 0.4786112129688263, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.23562500178813933, "rewards/MazeReward/std": 0.25146309435367586, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 127.8203125, "completions/min_length": 46.8, "completions/max_length": 695.0, "completions/clipped_ratio": 0.0015625, "kl": 0.10377971744164824, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.495726495726494, "step": 1930}, {"loss": 0.004088918119668961, "grad_norm": 0.6118948764226286, "learning_rate": 2.8849691993311777e-09, "reward": 3.6875, "reward_std": 0.47385877966880796, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.2437500089406967, "rewards/MazeReward/std": 0.23270266354084015, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 120.9828125, "completions/min_length": 44.8, "completions/max_length": 328.4, "completions/clipped_ratio": 0.0, "kl": 0.1021963557228446, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.53846153846154, "step": 1935}, {"loss": 0.004345090314745903, "grad_norm": 1.2784114354732232, "learning_rate": 2.458548727494292e-09, "reward": 3.659375, "reward_std": 0.7301452726125717, "frac_reward_zero_std": 0.7625, "rewards/MazeReward/mean": 0.24093750715255738, "rewards/MazeReward/std": 0.2568228155374527, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.25625, "completions/min_length": 47.6, "completions/max_length": 349.4, "completions/clipped_ratio": 0.0, "kl": 0.10859406501986087, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.581196581196583, "step": 1940}, {"loss": 0.004222322627902031, "grad_norm": 0.9898730665558909, "learning_rate": 2.066134552943077e-09, "reward": 3.5125, "reward_std": 0.441130667924881, "frac_reward_zero_std": 0.8625, "rewards/MazeReward/mean": 0.22625000178813934, "rewards/MazeReward/std": 0.2487412929534912, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 126.1, "completions/min_length": 48.8, "completions/max_length": 337.2, "completions/clipped_ratio": 0.0, "kl": 0.10555869597010314, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.623931623931625, "step": 1945}, {"loss": 0.0039948724210262295, "grad_norm": 1.0217738859538394, "learning_rate": 1.7077534966650765e-09, "reward": 3.428125, "reward_std": 0.5493013352155686, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.21781250238418579, "rewards/MazeReward/std": 0.21325217485427855, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 119.5265625, "completions/min_length": 45.8, "completions/max_length": 446.0, "completions/clipped_ratio": 0.0, "kl": 0.0998825051356107, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.666666666666668, "step": 1950}, {"loss": 0.004186463728547096, "grad_norm": 0.5884618325504257, "learning_rate": 1.383430053529422e-09, "reward": 3.6060546875, "reward_std": 0.5122001999523491, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.23562500774860382, "rewards/MazeReward/std": 0.26041761338710784, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 121.23125, "completions/min_length": 47.0, "completions/max_length": 349.6, "completions/clipped_ratio": 0.0, "kl": 0.10465789251029492, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.70940170940171, "step": 1955}, {"loss": 0.004532093182206154, "grad_norm": 0.6015423582858093, "learning_rate": 1.0931863906127325e-09, "reward": 3.45, "reward_std": 0.6776583135128021, "frac_reward_zero_std": 0.8, "rewards/MazeReward/mean": 0.22000000178813933, "rewards/MazeReward/std": 0.27564533352851867, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 123.5296875, "completions/min_length": 46.0, "completions/max_length": 337.6, "completions/clipped_ratio": 0.0, "kl": 0.1133018615655601, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.752136752136753, "step": 1960}, {"loss": 0.0046912945806980135, "grad_norm": 0.4046282144429942, "learning_rate": 8.370423456837139e-10, "reward": 3.6125, "reward_std": 0.24019813239574433, "frac_reward_zero_std": 0.925, "rewards/MazeReward/mean": 0.23625001609325408, "rewards/MazeReward/std": 0.22056612372398376, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.4421875, "completions/min_length": 43.2, "completions/max_length": 338.2, "completions/clipped_ratio": 0.0, "kl": 0.11728158215992153, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.794871794871796, "step": 1965}, {"loss": 0.004150073975324631, "grad_norm": 0.18848532642409926, "learning_rate": 6.150154258476314e-10, "reward": 3.2375, "reward_std": 0.39494048357009887, "frac_reward_zero_std": 0.8875, "rewards/MazeReward/mean": 0.19874999821186065, "rewards/MazeReward/std": 0.23839934766292573, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.6640625, "completions/min_length": 48.4, "completions/max_length": 369.6, "completions/clipped_ratio": 0.0, "kl": 0.10375166512094439, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.837606837606838, "step": 1970}, {"loss": 0.004333572089672088, "grad_norm": 1.0049912375133132, "learning_rate": 4.271208063494902e-10, "reward": 3.70625, "reward_std": 0.7638269186019897, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.2456250011920929, "rewards/MazeReward/std": 0.27377059757709504, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.5640625, "completions/min_length": 47.2, "completions/max_length": 347.0, "completions/clipped_ratio": 0.0, "kl": 0.10834184312261641, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.88034188034188, "step": 1975}, {"loss": 0.004268518462777138, "grad_norm": 0.54479139113427, "learning_rate": 2.733713295369755e-10, "reward": 3.378125, "reward_std": 0.5172571033239365, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.21281250268220903, "rewards/MazeReward/std": 0.24584324657917023, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 122.1046875, "completions/min_length": 47.8, "completions/max_length": 335.0, "completions/clipped_ratio": 0.0, "kl": 0.10669711260125041, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.923076923076923, "step": 1980}, {"loss": 0.004564845934510231, "grad_norm": 0.7059688953566956, "learning_rate": 1.53777503982655e-10, "reward": 3.671875, "reward_std": 0.5840405106544495, "frac_reward_zero_std": 0.8375, "rewards/MazeReward/mean": 0.24218750596046448, "rewards/MazeReward/std": 0.2895630538463593, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 124.6546875, "completions/min_length": 47.0, "completions/max_length": 344.0, "completions/clipped_ratio": 0.0, "kl": 0.11410397617146373, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 16.965811965811966, "step": 1985}, {"loss": 0.004448100179433823, "grad_norm": 0.848786610345723, "learning_rate": 6.834750376549791e-11, "reward": 3.9060546875, "reward_std": 0.45252889334224167, "frac_reward_zero_std": 0.85, "rewards/MazeReward/mean": 0.265625, "rewards/MazeReward/std": 0.2678088635206223, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "completions/mean_length": 123.0890625, "completions/min_length": 50.6, "completions/max_length": 316.0, "completions/clipped_ratio": 0.0, "kl": 0.11118856389075518, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 17.00854700854701, "step": 1990}, {"loss": 0.004484037682414055, "grad_norm": 0.7312599594007181, "learning_rate": 1.7087167912710476e-11, "reward": 4.41875, "reward_std": 0.7629148185253143, "frac_reward_zero_std": 0.7875, "rewards/MazeReward/mean": 0.31687500774860383, "rewards/MazeReward/std": 0.3001715898513794, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 121.8, "completions/min_length": 48.4, "completions/max_length": 336.2, "completions/clipped_ratio": 0.0, "kl": 0.11210196618922055, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 17.05128205128205, "step": 1995}, {"loss": 0.00425378829240799, "grad_norm": 0.4560788089531848, "learning_rate": 0.0, "reward": 3.821875, "reward_std": 0.6984930455684661, "frac_reward_zero_std": 0.825, "rewards/MazeReward/mean": 0.25718750059604645, "rewards/MazeReward/std": 0.28713129460811615, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "completions/mean_length": 118.578125, "completions/min_length": 46.0, "completions/max_length": 355.4, "completions/clipped_ratio": 0.0, "kl": 0.1063311716541648, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 17.094017094017094, "step": 2000}, {"train_runtime": 70600.3404, "train_samples_per_second": 3.626, "train_steps_per_second": 0.028, "total_flos": 0.0, "train_loss": 0.0034829180966808055, "epoch": 17.094017094017094, "step": 2000}], "memory": 39.0}