{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 17.094017094017094, "eval_steps": 100.0, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/mean_length": 147.515625, "completions/min_length": 43.0, "epoch": 0.008547008547008548, "frac_reward_zero_std": 0.25, "grad_norm": 2.0850979009306507, "kl": 0.0, "learning_rate": 1e-08, "loss": 5.3551048040390015e-09, "reward": 1.2958984375, "reward_std": 0.5146088600158691, "rewards/Format/mean": 0.2412109375, "rewards/Format/std": 0.04487404227256775, "rewards/MazeFormat/mean": 0.8828125, "rewards/MazeFormat/std": 0.322907418012619, "rewards/MazeReward/mean": 0.01718750037252903, "rewards/MazeReward/std": 0.05627460405230522, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/mean_length": 140.15625, "completions/min_length": 45.75, "epoch": 0.042735042735042736, "frac_reward_zero_std": 0.296875, "grad_norm": 1.6039673774878993, "kl": 0.0005648559153996757, "learning_rate": 5e-08, "loss": 2.261187728436198e-05, "reward": 1.216064453125, "reward_std": 0.4408714398741722, "rewards/Format/mean": 0.231689453125, "rewards/Format/std": 0.06383909657597542, "rewards/MazeFormat/mean": 0.88671875, "rewards/MazeFormat/std": 0.31641268730163574, "rewards/MazeReward/mean": 0.009765625698491931, "rewards/MazeReward/std": 0.04561943328008056, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.6, "completions/mean_length": 145.815625, "completions/min_length": 39.0, "epoch": 0.08547008547008547, "frac_reward_zero_std": 0.35, "grad_norm": 1.473835017850979, "kl": 0.0007853303133742884, "learning_rate": 1e-07, "loss": 3.144462825730443e-05, "reward": 1.2603515625, "reward_std": 0.44822131991386416, "rewards/Format/mean": 0.2369140625, "rewards/Format/std": 0.054740263521671294, "rewards/MazeFormat/mean": 0.9046875, "rewards/MazeFormat/std": 0.2910091012716293, "rewards/MazeReward/mean": 0.011875000246800483, "rewards/MazeReward/std": 0.05252101495862007, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.4, "completions/mean_length": 148.54375, "completions/min_length": 42.6, "epoch": 0.1282051282051282, "frac_reward_zero_std": 0.275, "grad_norm": 2.0583040142314197, "kl": 0.000737494510030956, "learning_rate": 1.5e-07, "loss": 2.9532110784202813e-05, "reward": 1.233203125, "reward_std": 0.3958433210849762, "rewards/Format/mean": 0.233203125, "rewards/Format/std": 0.06156868264079094, "rewards/MazeFormat/mean": 0.9, "rewards/MazeFormat/std": 0.2974621653556824, "rewards/MazeReward/mean": 0.010000000125728548, "rewards/MazeReward/std": 0.04012826085090637, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.8, "completions/mean_length": 141.3703125, "completions/min_length": 47.0, "epoch": 0.17094017094017094, "frac_reward_zero_std": 0.275, "grad_norm": 2.077471563018731, "kl": 0.0008433389681158588, "learning_rate": 2e-07, "loss": 3.3766511478461325e-05, "reward": 1.288671875, "reward_std": 0.481926828622818, "rewards/Format/mean": 0.233984375, "rewards/Format/std": 0.059659218043088914, "rewards/MazeFormat/mean": 0.8953125, "rewards/MazeFormat/std": 0.3048880577087402, "rewards/MazeReward/mean": 0.015937499795109035, "rewards/MazeReward/std": 0.05514752417802811, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.2, "completions/mean_length": 125.58125, "completions/min_length": 39.6, "epoch": 0.21367521367521367, "frac_reward_zero_std": 0.4375, "grad_norm": 1.5830201192961726, "kl": 0.001601347164978506, "learning_rate": 2.5e-07, "loss": 6.408471963368357e-05, "reward": 1.3404296875, "reward_std": 0.375272661447525, "rewards/Format/mean": 0.2419921875, "rewards/Format/std": 0.04338446594774723, "rewards/MazeFormat/mean": 0.9359375, "rewards/MazeFormat/std": 0.24299971163272857, "rewards/MazeReward/mean": 0.0162499999627471, "rewards/MazeReward/std": 0.0570610947906971, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.8, "completions/mean_length": 120.7453125, "completions/min_length": 40.8, "epoch": 0.2564102564102564, "frac_reward_zero_std": 0.5125, "grad_norm": 1.1955216398068267, "kl": 0.004001263665850274, "learning_rate": 3e-07, "loss": 0.00016010119579732419, "reward": 1.33046875, "reward_std": 0.3205643713474274, "rewards/Format/mean": 0.24296875, "rewards/Format/std": 0.04054766036570072, "rewards/MazeFormat/mean": 0.953125, "rewards/MazeFormat/std": 0.21053162813186646, "rewards/MazeReward/mean": 0.013437500456348062, "rewards/MazeReward/std": 0.048377957195043564, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.6, "completions/mean_length": 106.0109375, "completions/min_length": 36.8, "epoch": 0.29914529914529914, "frac_reward_zero_std": 0.675, "grad_norm": 1.3153789735954688, "kl": 0.011635770567227154, "learning_rate": 3.5e-07, "loss": 0.00046548396348953245, "reward": 1.3740234375, "reward_std": 0.23168300092220306, "rewards/Format/mean": 0.2490234375, "rewards/Format/std": 0.008435053564608098, "rewards/MazeFormat/mean": 0.978125, "rewards/MazeFormat/std": 0.14282614290714263, "rewards/MazeReward/mean": 0.014687499776482583, "rewards/MazeReward/std": 0.0510710246860981, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.4, "completions/mean_length": 107.7546875, "completions/min_length": 39.6, "epoch": 0.3418803418803419, "frac_reward_zero_std": 0.7, "grad_norm": 1.5192037310820845, "kl": 0.01894672798225656, "learning_rate": 4e-07, "loss": 0.0007580962032079696, "reward": 1.3908203125, "reward_std": 0.24733528196811677, "rewards/Format/mean": 0.2486328125, "rewards/Format/std": 0.013764306530356407, "rewards/MazeFormat/mean": 0.9859375, "rewards/MazeFormat/std": 0.10508071333169937, "rewards/MazeReward/mean": 0.015625, "rewards/MazeReward/std": 0.05470488891005516, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.4, "completions/mean_length": 108.796875, "completions/min_length": 41.4, "epoch": 0.38461538461538464, "frac_reward_zero_std": 0.725, "grad_norm": 0.9839755046926172, "kl": 0.0196377347339876, "learning_rate": 4.5e-07, "loss": 0.0007857446558773518, "reward": 1.377734375, "reward_std": 0.1983001172542572, "rewards/Format/mean": 0.249609375, "rewards/Format/std": 0.004419417306780815, "rewards/MazeFormat/mean": 0.9875, "rewards/MazeFormat/std": 0.09610848724842072, "rewards/MazeReward/mean": 0.0140625, "rewards/MazeReward/std": 0.04950801432132721, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/mean_length": 105.29375, "completions/min_length": 40.8, "epoch": 0.42735042735042733, "frac_reward_zero_std": 0.775, "grad_norm": 1.2306949614247369, "kl": 0.01729733906686306, "learning_rate": 5e-07, "loss": 0.0006918612867593765, "reward": 1.358984375, "reward_std": 0.16672340631484986, "rewards/Format/mean": 0.249609375, "rewards/Format/std": 0.004419417306780815, "rewards/MazeFormat/mean": 0.9875, "rewards/MazeFormat/std": 0.09960551857948304, "rewards/MazeReward/mean": 0.012187499948777259, "rewards/MazeReward/std": 0.04448289349675179, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/mean_length": 104.796875, "completions/min_length": 38.6, "epoch": 0.4700854700854701, "frac_reward_zero_std": 0.8125, "grad_norm": 1.0980013379578395, "kl": 0.015583514084573835, "learning_rate": 5.5e-07, "loss": 0.0006235324777662754, "reward": 1.3109375, "reward_std": 0.12922156751155853, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9921875, "rewards/MazeFormat/std": 0.06573191285133362, "rewards/MazeReward/mean": 0.006875000079162419, "rewards/MazeReward/std": 0.03500961922109127, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.8, "completions/mean_length": 99.0171875, "completions/min_length": 41.0, "epoch": 0.5128205128205128, "frac_reward_zero_std": 0.775, "grad_norm": 1.3129140370641044, "kl": 0.015794009924866258, "learning_rate": 6e-07, "loss": 0.00063182576559484, "reward": 1.3873046875, "reward_std": 0.2064604544546455, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535533845424652, "rewards/MazeReward/mean": 0.014062500186264515, "rewards/MazeReward/std": 0.047212396562099454, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.4, "completions/mean_length": 102.0234375, "completions/min_length": 42.4, "epoch": 0.5555555555555556, "frac_reward_zero_std": 0.775, "grad_norm": 1.0616087417649411, "kl": 0.012760270561557263, "learning_rate": 6.5e-07, "loss": 0.0005105869844555855, "reward": 1.3916015625, "reward_std": 0.19745952961966395, "rewards/Format/mean": 0.2494140625, "rewards/Format/std": 0.0066291259601712225, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.014375000912696122, "rewards/MazeReward/std": 0.04834332019090652, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.6, "completions/mean_length": 115.3265625, "completions/min_length": 44.0, "epoch": 0.5982905982905983, "frac_reward_zero_std": 0.675, "grad_norm": 1.3544242249856053, "kl": 0.007766712602460757, "learning_rate": 7e-07, "loss": 0.0003107914002612233, "reward": 1.4435546875, "reward_std": 0.31596590876579284, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.990625, "rewards/MazeFormat/std": 0.08515809774398804, "rewards/MazeReward/mean": 0.020312500186264515, "rewards/MazeReward/std": 0.06453245431184769, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.6, "completions/mean_length": 115.6609375, "completions/min_length": 46.4, "epoch": 0.6410256410256411, "frac_reward_zero_std": 0.75, "grad_norm": 1.3602426896267505, "kl": 0.0070880687271710485, "learning_rate": 7.5e-07, "loss": 0.000283644744195044, "reward": 1.4341796875, "reward_std": 0.23621928095817565, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.02490137964487076, "rewards/MazeReward/mean": 0.01875000037252903, "rewards/MazeReward/std": 0.061052392423152926, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 115.4859375, "completions/min_length": 43.0, "epoch": 0.6837606837606838, "frac_reward_zero_std": 0.7, "grad_norm": 1.3509089586341816, "kl": 0.00876753773773089, "learning_rate": 8e-07, "loss": 0.00035074278712272645, "reward": 1.4609375, "reward_std": 0.30132956802845, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9953125, "rewards/MazeFormat/std": 0.04257904887199402, "rewards/MazeReward/mean": 0.021562500018626453, "rewards/MazeReward/std": 0.0627759762108326, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.4, "completions/mean_length": 110.1203125, "completions/min_length": 45.0, "epoch": 0.7264957264957265, "frac_reward_zero_std": 0.775, "grad_norm": 1.1044949891253064, "kl": 0.01163666148786433, "learning_rate": 8.499999999999999e-07, "loss": 0.0004655797965824604, "reward": 1.3734375, "reward_std": 0.18065465837717057, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9921875, "rewards/MazeFormat/std": 0.07793438732624054, "rewards/MazeReward/mean": 0.013125000149011612, "rewards/MazeReward/std": 0.04797708801925182, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/mean_length": 88.609375, "completions/min_length": 40.6, "epoch": 0.7692307692307693, "frac_reward_zero_std": 0.775, "grad_norm": 0.996598772694048, "kl": 0.02447981040459126, "learning_rate": 9e-07, "loss": 0.0009792439639568328, "reward": 1.471875, "reward_std": 0.23379422426223756, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535533845424652, "rewards/MazeReward/mean": 0.02250000014901161, "rewards/MazeReward/std": 0.06349937170743943, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.2, "completions/mean_length": 79.990625, "completions/min_length": 40.4, "epoch": 0.811965811965812, "frac_reward_zero_std": 0.75, "grad_norm": 1.505195182911895, "kl": 0.03736886349506676, "learning_rate": 9.499999999999999e-07, "loss": 0.0014949593693017959, "reward": 1.5171875, "reward_std": 0.20643335282802583, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.026875000912696123, "rewards/MazeReward/std": 0.06502393409609794, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.4, "completions/mean_length": 80.1984375, "completions/min_length": 41.2, "epoch": 0.8547008547008547, "frac_reward_zero_std": 0.8, "grad_norm": 1.1733766193739803, "kl": 0.041934849717654286, "learning_rate": 1e-06, "loss": 0.0016774306073784827, "reward": 1.5046875, "reward_std": 0.182009756565094, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.025625000521540642, "rewards/MazeReward/std": 0.06656526178121566, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.8, "completions/mean_length": 91.3640625, "completions/min_length": 41.2, "epoch": 0.8974358974358975, "frac_reward_zero_std": 0.7625, "grad_norm": 1.4119164170618441, "kl": 0.03487839815206826, "learning_rate": 9.999829128320873e-07, "loss": 0.0013951731845736504, "reward": 1.4875, "reward_std": 0.22661330699920654, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.023750000447034837, "rewards/MazeReward/std": 0.06286502480506898, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.4, "completions/mean_length": 103.746875, "completions/min_length": 38.6, "epoch": 0.9401709401709402, "frac_reward_zero_std": 0.7625, "grad_norm": 1.2686400994592306, "kl": 0.037058884068392216, "learning_rate": 9.999316524962345e-07, "loss": 0.0014822594821453094, "reward": 1.55625, "reward_std": 0.21021372228860855, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535533845424652, "rewards/MazeReward/mean": 0.030937500670552252, "rewards/MazeReward/std": 0.07310352176427841, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.2, "completions/mean_length": 103.7015625, "completions/min_length": 42.6, "epoch": 0.9829059829059829, "frac_reward_zero_std": 0.7375, "grad_norm": 1.0874137167924887, "kl": 0.03958274000324309, "learning_rate": 9.998462224960173e-07, "loss": 0.0015832275152206422, "reward": 1.590625, "reward_std": 0.25334451496601107, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.034062500670552255, "rewards/MazeReward/std": 0.07615574449300766, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.8, "completions/mean_length": 104.0, "completions/min_length": 42.2, "epoch": 1.0256410256410255, "frac_reward_zero_std": 0.725, "grad_norm": 0.7102428626419721, "kl": 0.046541464724577965, "learning_rate": 9.99726628670463e-07, "loss": 0.0018614999949932098, "reward": 1.5640625, "reward_std": 0.2902904152870178, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.031562500074505805, "rewards/MazeReward/std": 0.07654295042157173, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.6, "completions/mean_length": 103.3359375, "completions/min_length": 44.2, "epoch": 1.0683760683760684, "frac_reward_zero_std": 0.8125, "grad_norm": 0.7699986902071484, "kl": 0.05786212412640453, "learning_rate": 9.995728791936505e-07, "loss": 0.002314428612589836, "reward": 1.6576171875, "reward_std": 0.1521604984998703, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9953125, "rewards/MazeFormat/std": 0.04257904887199402, "rewards/MazeReward/mean": 0.04125000163912773, "rewards/MazeReward/std": 0.07993590980768203, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.6, "completions/mean_length": 84.5421875, "completions/min_length": 41.4, "epoch": 1.1111111111111112, "frac_reward_zero_std": 0.85, "grad_norm": 1.1483079529637177, "kl": 0.07299737120047212, "learning_rate": 9.993849845741523e-07, "loss": 0.00292002372443676, "reward": 1.675, "reward_std": 0.12972374111413956, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.042500000447034836, "rewards/MazeReward/std": 0.0803371638059616, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.8, "completions/mean_length": 80.1828125, "completions/min_length": 41.2, "epoch": 1.1538461538461537, "frac_reward_zero_std": 0.925, "grad_norm": 0.22525600827587947, "kl": 0.07253276985138654, "learning_rate": 9.991629576543163e-07, "loss": 0.002901558205485344, "reward": 1.475, "reward_std": 0.06849094033241272, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.022500000335276125, "rewards/MazeReward/std": 0.0530305951833725, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.4, "completions/mean_length": 92.290625, "completions/min_length": 40.2, "epoch": 1.1965811965811965, "frac_reward_zero_std": 0.8375, "grad_norm": 0.7531123673947644, "kl": 0.058682880457490684, "learning_rate": 9.989068136093872e-07, "loss": 0.0023476168513298034, "reward": 1.6044921875, "reward_std": 0.1165056936442852, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.035625001043081285, "rewards/MazeReward/std": 0.07613980323076248, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.8, "completions/mean_length": 107.709375, "completions/min_length": 40.4, "epoch": 1.2393162393162394, "frac_reward_zero_std": 0.8125, "grad_norm": 0.9655821885141387, "kl": 0.04674590518698096, "learning_rate": 9.986165699464705e-07, "loss": 0.0018699193373322488, "reward": 1.678125, "reward_std": 0.18339579403400422, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04281250089406967, "rewards/MazeReward/std": 0.08515360653400421, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 106.034375, "completions/min_length": 39.8, "epoch": 1.282051282051282, "frac_reward_zero_std": 0.8625, "grad_norm": 0.8442355822998869, "kl": 0.047533696377649905, "learning_rate": 9.982922465033348e-07, "loss": 0.0019011721014976501, "reward": 1.5875, "reward_std": 0.11563374549150467, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.03375000078231096, "rewards/MazeReward/std": 0.07314120382070541, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.6, "completions/mean_length": 91.5609375, "completions/min_length": 41.4, "epoch": 1.3247863247863247, "frac_reward_zero_std": 0.925, "grad_norm": 0.20366802328438316, "kl": 0.059565877495333555, "learning_rate": 9.979338654470567e-07, "loss": 0.0023824993520975115, "reward": 1.7623046875, "reward_std": 0.05337072163820267, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.051249999180436136, "rewards/MazeReward/std": 0.08585172146558762, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.8, "completions/mean_length": 93.615625, "completions/min_length": 40.2, "epoch": 1.3675213675213675, "frac_reward_zero_std": 0.95, "grad_norm": 0.8988364942792788, "kl": 0.05831799576990306, "learning_rate": 9.975414512725056e-07, "loss": 0.0023328181356191634, "reward": 1.753125, "reward_std": 0.04218914955854416, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.050312502309679985, "rewards/MazeReward/std": 0.08477405905723571, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/mean_length": 96.5296875, "completions/min_length": 41.6, "epoch": 1.4102564102564101, "frac_reward_zero_std": 0.8375, "grad_norm": 0.6869513437818591, "kl": 0.04930293974466622, "learning_rate": 9.971150308006687e-07, "loss": 0.001972428523004055, "reward": 1.6859375, "reward_std": 0.14878431037068368, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.04375000130385161, "rewards/MazeReward/std": 0.08216542676091194, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.4, "completions/mean_length": 96.5640625, "completions/min_length": 39.8, "epoch": 1.452991452991453, "frac_reward_zero_std": 1.0, "grad_norm": 0.2002667446495595, "kl": 0.05207843626849353, "learning_rate": 9.966546331768192e-07, "loss": 0.0020832683891057967, "reward": 1.575, "reward_std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.03250000104308128, "rewards/MazeReward/std": 0.07358299195766449, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.4, "completions/mean_length": 105.240625, "completions/min_length": 39.0, "epoch": 1.4957264957264957, "frac_reward_zero_std": 0.975, "grad_norm": 0.48721820593956633, "kl": 0.04866266236640513, "learning_rate": 9.961602898685223e-07, "loss": 0.0019463833421468734, "reward": 1.74375, "reward_std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04937500096857548, "rewards/MazeReward/std": 0.08451755046844482, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.8, "completions/mean_length": 116.1109375, "completions/min_length": 42.0, "epoch": 1.5384615384615383, "frac_reward_zero_std": 0.925, "grad_norm": 0.46120689224078665, "kl": 0.04409475696738809, "learning_rate": 9.956320346634875e-07, "loss": 0.0017639096826314927, "reward": 1.7826171875, "reward_std": 0.0430610993411392, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.053437501564621924, "rewards/MazeReward/std": 0.08518625646829606, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.6, "completions/mean_length": 130.6703125, "completions/min_length": 44.0, "epoch": 1.5811965811965814, "frac_reward_zero_std": 0.9625, "grad_norm": 0.5148105099027588, "kl": 0.045597366779111324, "learning_rate": 9.95069903667256e-07, "loss": 0.0018238790333271026, "reward": 1.596875, "reward_std": 0.031984337419271466, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.03468750007450581, "rewards/MazeReward/std": 0.07464145123958588, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 127.4046875, "completions/min_length": 42.6, "epoch": 1.623931623931624, "frac_reward_zero_std": 0.95, "grad_norm": 0.5138975947309226, "kl": 0.04191551350522786, "learning_rate": 9.944739353007341e-07, "loss": 0.0016768455505371093, "reward": 1.7375, "reward_std": 0.03535533845424652, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.048750000447034834, "rewards/MazeReward/std": 0.0851988285779953, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.6, "completions/mean_length": 117.6421875, "completions/min_length": 41.0, "epoch": 1.6666666666666665, "frac_reward_zero_std": 0.95, "grad_norm": 0.9067737103191192, "kl": 0.03926404060330242, "learning_rate": 9.938441702975689e-07, "loss": 0.0015711262822151184, "reward": 1.634375, "reward_std": 0.03808925524353981, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.038437500968575476, "rewards/MazeReward/std": 0.07839376032352448, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.2, "completions/mean_length": 110.8484375, "completions/min_length": 40.2, "epoch": 1.7094017094017095, "frac_reward_zero_std": 0.975, "grad_norm": 0.09424510883121427, "kl": 0.041527598001994195, "learning_rate": 9.931806517013612e-07, "loss": 0.0016614319756627083, "reward": 1.840625, "reward_std": 0.02041158601641655, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.059062501043081285, "rewards/MazeReward/std": 0.09025485664606095, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 113.19375, "completions/min_length": 41.6, "epoch": 1.7521367521367521, "frac_reward_zero_std": 0.9875, "grad_norm": 0.677800532639946, "kl": 0.04297648051287979, "learning_rate": 9.924834248627258e-07, "loss": 0.0017192240804433823, "reward": 1.896875, "reward_std": 0.00883883461356163, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.06468750163912773, "rewards/MazeReward/std": 0.09119289517402648, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.2, "completions/mean_length": 121.43125, "completions/min_length": 41.4, "epoch": 1.7948717948717947, "frac_reward_zero_std": 0.9375, "grad_norm": 0.09625960474724543, "kl": 0.036925971927121284, "learning_rate": 9.917525374361911e-07, "loss": 0.0014772934839129448, "reward": 1.5576171875, "reward_std": 0.05305119827389717, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.0309375012293458, "rewards/MazeReward/std": 0.059112554788589476, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.8, "completions/mean_length": 115.7890625, "completions/min_length": 40.2, "epoch": 1.8376068376068377, "frac_reward_zero_std": 0.9625, "grad_norm": 0.8859900164071041, "kl": 0.03985867821611464, "learning_rate": 9.909880393769418e-07, "loss": 0.0015943828970193864, "reward": 1.715625, "reward_std": 0.02651650384068489, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04656250104308128, "rewards/MazeReward/std": 0.08200520724058151, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.8, "completions/mean_length": 126.36875, "completions/min_length": 42.0, "epoch": 1.8803418803418803, "frac_reward_zero_std": 0.95, "grad_norm": 0.3847694911433562, "kl": 0.03536163377575576, "learning_rate": 9.901899829374047e-07, "loss": 0.0014145723544061184, "reward": 1.5826171875, "reward_std": 0.04306109994649887, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.03343750163912773, "rewards/MazeReward/std": 0.06289278417825699, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.6, "completions/mean_length": 129.515625, "completions/min_length": 46.0, "epoch": 1.9230769230769231, "frac_reward_zero_std": 0.975, "grad_norm": 0.10814792355364673, "kl": 0.03294098875485361, "learning_rate": 9.893584226636772e-07, "loss": 0.0013176266103982926, "reward": 1.684375, "reward_std": 0.022201896458864213, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04343750029802322, "rewards/MazeReward/std": 0.08095956891775132, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.4, "completions/mean_length": 131.4390625, "completions/min_length": 47.0, "epoch": 1.965811965811966, "frac_reward_zero_std": 0.9875, "grad_norm": 0.0733353932479021, "kl": 0.029036648059263825, "learning_rate": 9.884934153917996e-07, "loss": 0.001161702163517475, "reward": 1.571875, "reward_std": 0.00883883461356163, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.03218750059604645, "rewards/MazeReward/std": 0.07254017442464829, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.6, "completions/mean_length": 114.7703125, "completions/min_length": 45.8, "epoch": 2.0085470085470085, "frac_reward_zero_std": 0.9625, "grad_norm": 0.09827642734369745, "kl": 0.030789492116309703, "learning_rate": 9.8759502024387e-07, "loss": 0.0012318025343120097, "reward": 1.759375, "reward_std": 0.03061639815568924, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.05093750227242708, "rewards/MazeReward/std": 0.08228155076503754, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.4, "completions/mean_length": 113.1171875, "completions/min_length": 46.0, "epoch": 2.051282051282051, "frac_reward_zero_std": 0.975, "grad_norm": 0.12057015042239135, "kl": 0.03512264594901353, "learning_rate": 9.866632986240029e-07, "loss": 0.0014049587771296501, "reward": 1.7625, "reward_std": 0.023145502805709837, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.05125000160187483, "rewards/MazeReward/std": 0.07912175357341766, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.8, "completions/mean_length": 133.7921875, "completions/min_length": 44.0, "epoch": 2.094017094017094, "frac_reward_zero_std": 0.975, "grad_norm": 0.0906374129160372, "kl": 0.030867059994488955, "learning_rate": 9.856983142141337e-07, "loss": 0.001234784722328186, "reward": 1.66875, "reward_std": 0.03104073107242584, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.041875001043081284, "rewards/MazeReward/std": 0.07904350236058236, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/mean_length": 123.0875, "completions/min_length": 45.4, "epoch": 2.1367521367521367, "frac_reward_zero_std": 0.95, "grad_norm": 0.09514702459075781, "kl": 0.03039407222531736, "learning_rate": 9.847001329696652e-07, "loss": 0.0012160670943558217, "reward": 1.61875, "reward_std": 0.04671337753534317, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.036875000596046446, "rewards/MazeReward/std": 0.07626682817935944, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.6, "completions/mean_length": 121.2296875, "completions/min_length": 40.0, "epoch": 2.1794871794871793, "frac_reward_zero_std": 0.975, "grad_norm": 0.0658722574421842, "kl": 0.03329704308416694, "learning_rate": 9.836688231149592e-07, "loss": 0.0013320941478013992, "reward": 1.890625, "reward_std": 0.02041158601641655, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.06406250149011612, "rewards/MazeReward/std": 0.09125813692808152, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/mean_length": 135.5328125, "completions/min_length": 45.2, "epoch": 2.2222222222222223, "frac_reward_zero_std": 0.9125, "grad_norm": 1.136778575046199, "kl": 0.031809128040913494, "learning_rate": 9.826044551386742e-07, "loss": 0.0012724403291940688, "reward": 1.8025390625, "reward_std": 0.051319288462400435, "rewards/Format/mean": 0.2494140625, "rewards/Format/std": 0.003797071799635887, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.02490137964487076, "rewards/MazeReward/mean": 0.05562500096857548, "rewards/MazeReward/std": 0.08506897389888764, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.4, "completions/mean_length": 129.78125, "completions/min_length": 42.6, "epoch": 2.264957264957265, "frac_reward_zero_std": 0.95, "grad_norm": 0.09204891341686909, "kl": 0.03131271551828831, "learning_rate": 9.81507101788948e-07, "loss": 0.0012527533806860446, "reward": 1.5125, "reward_std": 0.03535533845424652, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.026249999925494195, "rewards/MazeReward/std": 0.057460378110408786, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.8, "completions/mean_length": 115.8, "completions/min_length": 41.6, "epoch": 2.3076923076923075, "frac_reward_zero_std": 0.9875, "grad_norm": 0.10430657024771325, "kl": 0.033433203084859996, "learning_rate": 9.803768380684242e-07, "loss": 0.00133742094039917, "reward": 1.646875, "reward_std": 0.00883883461356163, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.039687500521540645, "rewards/MazeReward/std": 0.07787278592586518, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.6, "completions/mean_length": 119.921875, "completions/min_length": 41.2, "epoch": 2.3504273504273505, "frac_reward_zero_std": 0.9625, "grad_norm": 0.6038584494068286, "kl": 0.027464703540317714, "learning_rate": 9.792137412291263e-07, "loss": 0.001098698191344738, "reward": 1.5853515625, "reward_std": 0.01943976073525846, "rewards/Format/mean": 0.2494140625, "rewards/Format/std": 0.0066291259601712225, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.03375000134110451, "rewards/MazeReward/std": 0.06722360253334045, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 109.83125, "completions/min_length": 39.0, "epoch": 2.393162393162393, "frac_reward_zero_std": 0.9625, "grad_norm": 0.11135024211508726, "kl": 0.029570043087005615, "learning_rate": 9.780178907671788e-07, "loss": 0.001182807795703411, "reward": 1.528125, "reward_std": 0.03061639815568924, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.027812500298023225, "rewards/MazeReward/std": 0.06877715289592742, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.2, "completions/mean_length": 105.146875, "completions/min_length": 44.0, "epoch": 2.435897435897436, "frac_reward_zero_std": 0.9875, "grad_norm": 0.08211805142370257, "kl": 0.03102337378077209, "learning_rate": 9.76789368417372e-07, "loss": 0.001240898482501507, "reward": 1.66875, "reward_std": 0.011572751402854919, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.041875001043081284, "rewards/MazeReward/std": 0.08127498030662536, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.2, "completions/mean_length": 116.95, "completions/min_length": 44.0, "epoch": 2.4786324786324787, "frac_reward_zero_std": 0.9875, "grad_norm": 0.08713728279299042, "kl": 0.02888093756046146, "learning_rate": 9.755282581475767e-07, "loss": 0.0011553137563169002, "reward": 1.5734375, "reward_std": 0.004419417306780815, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.032500000670552254, "rewards/MazeReward/std": 0.07290461361408233, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.6, "completions/mean_length": 117.6546875, "completions/min_length": 43.0, "epoch": 2.5213675213675213, "frac_reward_zero_std": 0.9375, "grad_norm": 0.6225014244879482, "kl": 0.030179329228121787, "learning_rate": 9.742346461530047e-07, "loss": 0.001207088492810726, "reward": 1.7078125, "reward_std": 0.048613591492176055, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.045937500335276125, "rewards/MazeReward/std": 0.07529200837016106, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.2, "completions/mean_length": 108.06875, "completions/min_length": 44.0, "epoch": 2.564102564102564, "frac_reward_zero_std": 0.9875, "grad_norm": 0.10214096308893882, "kl": 0.030370077083352952, "learning_rate": 9.729086208503173e-07, "loss": 0.0012148864567279815, "reward": 1.646875, "reward_std": 0.00883883461356163, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.039687500521540645, "rewards/MazeReward/std": 0.07787278592586518, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.6, "completions/mean_length": 111.646875, "completions/min_length": 42.2, "epoch": 2.606837606837607, "frac_reward_zero_std": 0.95, "grad_norm": 0.07743790574000314, "kl": 0.026924411568325014, "learning_rate": 9.715502728715825e-07, "loss": 0.0010770590975880622, "reward": 1.6828125, "reward_std": 0.03503581583499908, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.04343750029802322, "rewards/MazeReward/std": 0.08254078775644302, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.2, "completions/mean_length": 109.6625, "completions/min_length": 41.0, "epoch": 2.6495726495726495, "frac_reward_zero_std": 0.95, "grad_norm": 0.5994910349443677, "kl": 0.031393040483817455, "learning_rate": 9.701596950580807e-07, "loss": 0.0012557756155729294, "reward": 1.8076171875, "reward_std": 0.0355882428586483, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.05593749955296516, "rewards/MazeReward/std": 0.08937290459871292, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.2, "completions/mean_length": 103.4078125, "completions/min_length": 44.0, "epoch": 2.6923076923076925, "frac_reward_zero_std": 0.95, "grad_norm": 0.5198429194092566, "kl": 0.031615292513743044, "learning_rate": 9.687369824539576e-07, "loss": 0.0012647857889533042, "reward": 1.659375, "reward_std": 0.03808925524353981, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.040937500074505806, "rewards/MazeReward/std": 0.07862303704023361, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/mean_length": 98.4875, "completions/min_length": 40.2, "epoch": 2.735042735042735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888439306908069, "kl": 0.03628600856754929, "learning_rate": 9.672822322997304e-07, "loss": 0.0014513864181935788, "reward": 1.925, "reward_std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.0675000011920929, "rewards/MazeReward/std": 0.09291007369756699, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.2, "completions/mean_length": 123.2421875, "completions/min_length": 42.2, "epoch": 2.7777777777777777, "frac_reward_zero_std": 1.0, "grad_norm": 0.1063652763694058, "kl": 0.030510167125612497, "learning_rate": 9.657955440256395e-07, "loss": 0.001220554392784834, "reward": 1.675, "reward_std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04250000156462193, "rewards/MazeReward/std": 0.07974326312541961, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.4, "completions/mean_length": 141.16875, "completions/min_length": 47.0, "epoch": 2.8205128205128203, "frac_reward_zero_std": 0.9375, "grad_norm": 0.06943368706988594, "kl": 0.029479713435284792, "learning_rate": 9.642770192448535e-07, "loss": 0.0011792988516390324, "reward": 1.70625, "reward_std": 0.06123279705643654, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04562500063329935, "rewards/MazeReward/std": 0.07920214980840683, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.4, "completions/mean_length": 131.9328125, "completions/min_length": 43.2, "epoch": 2.8632478632478633, "frac_reward_zero_std": 0.9875, "grad_norm": 0.07015473179679628, "kl": 0.03236345420591533, "learning_rate": 9.627267617465243e-07, "loss": 0.0012946173548698426, "reward": 1.671875, "reward_std": 0.00883883461356163, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04218750037252903, "rewards/MazeReward/std": 0.08093532919883728, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.4, "completions/mean_length": 114.1890625, "completions/min_length": 41.2, "epoch": 2.905982905982906, "frac_reward_zero_std": 0.975, "grad_norm": 0.0776687533387394, "kl": 0.03055897105950862, "learning_rate": 9.611448774886923e-07, "loss": 0.00122256800532341, "reward": 1.64375, "reward_std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.03937500044703483, "rewards/MazeReward/std": 0.07964256107807159, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.6, "completions/mean_length": 115.2296875, "completions/min_length": 43.8, "epoch": 2.948717948717949, "frac_reward_zero_std": 0.9625, "grad_norm": 0.5003867330049324, "kl": 0.03254980493802577, "learning_rate": 9.595314745910455e-07, "loss": 0.0013021795079112054, "reward": 1.7, "reward_std": 0.03535533845424652, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.045000001601874826, "rewards/MazeReward/std": 0.07816829383373261, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.6, "completions/mean_length": 126.540625, "completions/min_length": 42.8, "epoch": 2.9914529914529915, "frac_reward_zero_std": 0.9625, "grad_norm": 0.06887739934107479, "kl": 0.028944591525942087, "learning_rate": 9.578866633275286e-07, "loss": 0.0011580833233892918, "reward": 1.7109375, "reward_std": 0.03366983756422996, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.04625000152736902, "rewards/MazeReward/std": 0.0803494080901146, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/mean_length": 135.721875, "completions/min_length": 46.0, "epoch": 3.034188034188034, "frac_reward_zero_std": 0.9375, "grad_norm": 0.41790464887707834, "kl": 0.028804128093179317, "learning_rate": 9.562105561188068e-07, "loss": 0.001152261160314083, "reward": 1.5138671875, "reward_std": 0.04716099426150322, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.02656250037252903, "rewards/MazeReward/std": 0.06563240215182305, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.8, "completions/mean_length": 120.253125, "completions/min_length": 45.0, "epoch": 3.076923076923077, "frac_reward_zero_std": 0.9625, "grad_norm": 0.6825711537810063, "kl": 0.025093021499924362, "learning_rate": 9.545032675245813e-07, "loss": 0.001003839448094368, "reward": 1.6169921875, "reward_std": 0.02264951393008232, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.036875000782310964, "rewards/MazeReward/std": 0.07546763122081757, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.6, "completions/mean_length": 112.9390625, "completions/min_length": 42.0, "epoch": 3.1196581196581197, "frac_reward_zero_std": 1.0, "grad_norm": 0.07988617640833492, "kl": 0.028438647370785476, "learning_rate": 9.527649142357594e-07, "loss": 0.0011375134810805321, "reward": 1.725, "reward_std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.047500000521540645, "rewards/MazeReward/std": 0.08356983661651611, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.2, "completions/mean_length": 117.5203125, "completions/min_length": 45.0, "epoch": 3.1623931623931623, "frac_reward_zero_std": 0.95, "grad_norm": 0.6277492109642122, "kl": 0.030780851130839438, "learning_rate": 9.509956150664795e-07, "loss": 0.0012312138453125954, "reward": 1.68125, "reward_std": 0.05303300693631172, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04312500096857548, "rewards/MazeReward/std": 0.08240270167589188, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 680.4, "completions/mean_length": 119.6640625, "completions/min_length": 45.0, "epoch": 3.2051282051282053, "frac_reward_zero_std": 0.8375, "grad_norm": 0.5032696947783828, "kl": 0.02494008478242904, "learning_rate": 9.491954909459894e-07, "loss": 0.0009977094829082488, "reward": 1.8482421875, "reward_std": 0.15122394859790803, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.060000001639127734, "rewards/MazeReward/std": 0.09219729751348496, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.8, "completions/mean_length": 106.1796875, "completions/min_length": 42.6, "epoch": 3.247863247863248, "frac_reward_zero_std": 0.925, "grad_norm": 0.14253494037926903, "kl": 0.02694298147689551, "learning_rate": 9.473646649103817e-07, "loss": 0.0010777967981994152, "reward": 1.675, "reward_std": 0.05713290050625801, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.042499999329447744, "rewards/MazeReward/std": 0.07458726465702056, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.4, "completions/mean_length": 98.975, "completions/min_length": 42.2, "epoch": 3.2905982905982905, "frac_reward_zero_std": 0.95, "grad_norm": 0.3666414752240412, "kl": 0.03238045631442219, "learning_rate": 9.455032620941839e-07, "loss": 0.0012950697913765908, "reward": 1.673046875, "reward_std": 0.03996608294546604, "rewards/Format/mean": 0.249609375, "rewards/Format/std": 0.004419417306780815, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.042500000819563864, "rewards/MazeReward/std": 0.081744384765625, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/mean_length": 99.20625, "completions/min_length": 42.4, "epoch": 3.3333333333333335, "frac_reward_zero_std": 0.95, "grad_norm": 0.11241272737221532, "kl": 0.03298132244963199, "learning_rate": 9.436114097218058e-07, "loss": 0.0013192273676395417, "reward": 1.771875, "reward_std": 0.05576692372560501, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.05218750089406967, "rewards/MazeReward/std": 0.08882526755332946, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.2, "completions/mean_length": 109.65, "completions/min_length": 42.0, "epoch": 3.376068376068376, "frac_reward_zero_std": 0.95, "grad_norm": 0.4786503603918001, "kl": 0.030337114841677247, "learning_rate": 9.416892370988442e-07, "loss": 0.0012138230726122857, "reward": 1.6140625, "reward_std": 0.030935921147465704, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.03656250089406967, "rewards/MazeReward/std": 0.07630382627248763, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/mean_length": 118.1140625, "completions/min_length": 42.8, "epoch": 3.4188034188034186, "frac_reward_zero_std": 0.925, "grad_norm": 0.4470622397798485, "kl": 0.02863241416634992, "learning_rate": 9.397368756032444e-07, "loss": 0.0011456131003797054, "reward": 1.621875, "reward_std": 0.09722717925906181, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.037187499552965166, "rewards/MazeReward/std": 0.08185895383358002, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.6, "completions/mean_length": 127.16875, "completions/min_length": 46.8, "epoch": 3.4615384615384617, "frac_reward_zero_std": 0.9125, "grad_norm": 0.4108630055803702, "kl": 0.03009375943802297, "learning_rate": 9.377544586763214e-07, "loss": 0.0012037239968776703, "reward": 1.746875, "reward_std": 0.06733967810869217, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.049687501043081284, "rewards/MazeReward/std": 0.08626897037029266, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.6, "completions/mean_length": 117.2375, "completions/min_length": 41.0, "epoch": 3.5042735042735043, "frac_reward_zero_std": 0.9375, "grad_norm": 0.48309240343203796, "kl": 0.03096096939407289, "learning_rate": 9.357421218136386e-07, "loss": 0.0012385781854391098, "reward": 1.659375, "reward_std": 0.067339675873518, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04093750081956386, "rewards/MazeReward/std": 0.08117270022630692, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.6, "completions/mean_length": 114.11875, "completions/min_length": 42.6, "epoch": 3.547008547008547, "frac_reward_zero_std": 0.95, "grad_norm": 0.07792718733846904, "kl": 0.03464255495928228, "learning_rate": 9.337000025557476e-07, "loss": 0.0013860519975423813, "reward": 1.80625, "reward_std": 0.05303300693631172, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.055624999850988385, "rewards/MazeReward/std": 0.08941979110240936, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.6, "completions/mean_length": 130.7765625, "completions/min_length": 44.2, "epoch": 3.58974358974359, "frac_reward_zero_std": 0.9875, "grad_norm": 0.0701209753535078, "kl": 0.03212045237887651, "learning_rate": 9.316282404787869e-07, "loss": 0.0012850278988480567, "reward": 1.721875, "reward_std": 0.00883883461356163, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04718750026077032, "rewards/MazeReward/std": 0.07888221591711045, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.8, "completions/mean_length": 138.6421875, "completions/min_length": 42.2, "epoch": 3.6324786324786325, "frac_reward_zero_std": 1.0, "grad_norm": 0.06630152308360243, "kl": 0.03000037738820538, "learning_rate": 9.295269771849425e-07, "loss": 0.001200066413730383, "reward": 1.55, "reward_std": 0.0, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.030000000447034835, "rewards/MazeReward/std": 0.06700892895460128, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.4, "completions/mean_length": 133.49375, "completions/min_length": 41.2, "epoch": 3.6752136752136755, "frac_reward_zero_std": 0.95, "grad_norm": 0.6490659454594846, "kl": 0.027061588026117533, "learning_rate": 9.273963562927694e-07, "loss": 0.0010827738791704179, "reward": 1.71875, "reward_std": 0.04692808985710144, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04687500018626452, "rewards/MazeReward/std": 0.08021234273910523, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.6, "completions/mean_length": 129.9046875, "completions/min_length": 46.2, "epoch": 3.717948717948718, "frac_reward_zero_std": 0.95, "grad_norm": 0.06958012670951508, "kl": 0.024378333985805512, "learning_rate": 9.252365234273753e-07, "loss": 0.0009752914309501648, "reward": 1.653125, "reward_std": 0.04419417306780815, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.040312501788139346, "rewards/MazeReward/std": 0.07667017579078675, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.4, "completions/mean_length": 125.5953125, "completions/min_length": 40.6, "epoch": 3.7606837606837606, "frac_reward_zero_std": 0.9375, "grad_norm": 0.4209501303182819, "kl": 0.027644472965039312, "learning_rate": 9.230476262104676e-07, "loss": 0.001106039434671402, "reward": 1.778125, "reward_std": 0.09722718000411987, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.052812499925494194, "rewards/MazeReward/std": 0.09043156951665879, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.2, "completions/mean_length": 123.809375, "completions/min_length": 40.6, "epoch": 3.8034188034188032, "frac_reward_zero_std": 0.9125, "grad_norm": 0.6137811629084503, "kl": 0.032928169379010795, "learning_rate": 9.208298142502635e-07, "loss": 0.0013174701482057572, "reward": 1.853125, "reward_std": 0.1190047413110733, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.060312502831220624, "rewards/MazeReward/std": 0.09812327623367309, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1026.6, "completions/mean_length": 116.4984375, "completions/min_length": 39.2, "epoch": 3.8461538461538463, "frac_reward_zero_std": 0.9375, "grad_norm": 0.11178824029006519, "kl": 0.030316670378670096, "learning_rate": 9.185832391312642e-07, "loss": 0.0012127190828323364, "reward": 1.6234375, "reward_std": 0.03977475576102733, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9953125, "rewards/MazeFormat/std": 0.04257904887199402, "rewards/MazeReward/mean": 0.03781250044703484, "rewards/MazeReward/std": 0.07775698155164719, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.8, "completions/mean_length": 105.140625, "completions/min_length": 38.6, "epoch": 3.888888888888889, "frac_reward_zero_std": 0.9875, "grad_norm": 0.11069044326468976, "kl": 0.032291453005746006, "learning_rate": 9.163080544038952e-07, "loss": 0.0012919959612190724, "reward": 1.68125, "reward_std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.043125000037252906, "rewards/MazeReward/std": 0.07953909039497375, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.8, "completions/mean_length": 118.80625, "completions/min_length": 41.6, "epoch": 3.931623931623932, "frac_reward_zero_std": 0.95, "grad_norm": 0.07418346654460156, "kl": 0.027222537877969445, "learning_rate": 9.1400441557401e-07, "loss": 0.0010891311801970005, "reward": 1.684375, "reward_std": 0.06187183856964111, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04343750141561031, "rewards/MazeReward/std": 0.08347698003053665, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.4, "completions/mean_length": 124.1578125, "completions/min_length": 42.2, "epoch": 3.9743589743589745, "frac_reward_zero_std": 0.9375, "grad_norm": 0.4513961725261483, "kl": 0.02881625925656408, "learning_rate": 9.116724800922629e-07, "loss": 0.001152960304170847, "reward": 1.671875, "reward_std": 0.06733967512845992, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.0421875, "rewards/MazeReward/std": 0.0824856549501419, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.6, "completions/mean_length": 127.49375, "completions/min_length": 42.2, "epoch": 4.017094017094017, "frac_reward_zero_std": 0.9875, "grad_norm": 0.07745948442852763, "kl": 0.029386252618860454, "learning_rate": 9.093124073433462e-07, "loss": 0.0011756937950849534, "reward": 1.75625, "reward_std": 0.01767766922712326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.05062500163912773, "rewards/MazeReward/std": 0.08348544836044311, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.8, "completions/mean_length": 132.2203125, "completions/min_length": 45.6, "epoch": 4.05982905982906, "frac_reward_zero_std": 0.9625, "grad_norm": 0.49465611720279273, "kl": 0.023052448022644965, "learning_rate": 9.069243586350975e-07, "loss": 0.0009222443215548993, "reward": 1.659375, "reward_std": 0.04419417306780815, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.040937500447034834, "rewards/MazeReward/std": 0.08035238832235336, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.4, "completions/mean_length": 139.25625, "completions/min_length": 46.4, "epoch": 4.102564102564102, "frac_reward_zero_std": 0.9625, "grad_norm": 0.3804415181399902, "kl": 0.023549946118146182, "learning_rate": 9.045084971874737e-07, "loss": 0.0009421935304999351, "reward": 1.7873046875, "reward_std": 0.03590776561759412, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.05375000089406967, "rewards/MazeReward/std": 0.08982028812170029, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.2, "completions/mean_length": 131.678125, "completions/min_length": 42.8, "epoch": 4.145299145299146, "frac_reward_zero_std": 0.925, "grad_norm": 0.057119814736412144, "kl": 0.023166414664592593, "learning_rate": 9.020649881213958e-07, "loss": 0.0009268797934055328, "reward": 1.7421875, "reward_std": 0.07312507033348084, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.049375002086162564, "rewards/MazeReward/std": 0.0843595564365387, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/mean_length": 118.6171875, "completions/min_length": 43.2, "epoch": 4.188034188034188, "frac_reward_zero_std": 0.975, "grad_norm": 0.06585259973199104, "kl": 0.027671133645344525, "learning_rate": 8.995939984474623e-07, "loss": 0.0011070730164647103, "reward": 1.765625, "reward_std": 0.044194172322750094, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.0515625, "rewards/MazeReward/std": 0.0891284242272377, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.6, "completions/mean_length": 133.1890625, "completions/min_length": 47.0, "epoch": 4.230769230769231, "frac_reward_zero_std": 0.95, "grad_norm": 0.07195953683323365, "kl": 0.026406785706058145, "learning_rate": 8.970956970545355e-07, "loss": 0.0010565707460045814, "reward": 1.653125, "reward_std": 0.07954951077699661, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04031250141561031, "rewards/MazeReward/std": 0.08194140195846558, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.6, "completions/mean_length": 145.796875, "completions/min_length": 46.4, "epoch": 4.273504273504273, "frac_reward_zero_std": 0.9, "grad_norm": 0.7781681320743019, "kl": 0.03257816187106073, "learning_rate": 8.945702546981968e-07, "loss": 0.0013033507391810417, "reward": 1.8734375, "reward_std": 0.12816310077905654, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.06250000223517418, "rewards/MazeReward/std": 0.09654748886823654, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.8, "completions/mean_length": 137.4609375, "completions/min_length": 46.6, "epoch": 4.316239316239316, "frac_reward_zero_std": 0.9, "grad_norm": 0.3578918222768929, "kl": 0.03791391234844923, "learning_rate": 8.920178439890764e-07, "loss": 0.0015162624418735503, "reward": 1.68125, "reward_std": 0.15235702246427535, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.043125000596046445, "rewards/MazeReward/std": 0.08651673793792725, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/mean_length": 133.7265625, "completions/min_length": 46.4, "epoch": 4.358974358974359, "frac_reward_zero_std": 0.95, "grad_norm": 0.09537837140559144, "kl": 0.05111516213510185, "learning_rate": 8.894386393810562e-07, "loss": 0.002045181207358837, "reward": 1.75, "reward_std": 0.08984613418579102, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.05000000149011612, "rewards/MazeReward/std": 0.09007189571857452, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.6, "completions/mean_length": 132.690625, "completions/min_length": 46.0, "epoch": 4.401709401709402, "frac_reward_zero_std": 0.925, "grad_norm": 0.4824559224968059, "kl": 0.04020680082030594, "learning_rate": 8.868328171593446e-07, "loss": 0.0016084747388958932, "reward": 1.671875, "reward_std": 0.12037268280982971, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04218750037252903, "rewards/MazeReward/std": 0.08702817857265473, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.6, "completions/mean_length": 146.3796875, "completions/min_length": 45.8, "epoch": 4.444444444444445, "frac_reward_zero_std": 0.8375, "grad_norm": 0.8577094695092758, "kl": 0.032066563097760084, "learning_rate": 8.842005554284295e-07, "loss": 0.0012827066704630852, "reward": 1.6953125, "reward_std": 0.22211109101772308, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.044687502458691596, "rewards/MazeReward/std": 0.09304451793432236, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.2, "completions/mean_length": 153.4734375, "completions/min_length": 52.4, "epoch": 4.487179487179487, "frac_reward_zero_std": 0.9375, "grad_norm": 0.10337241127092116, "kl": 0.03314277136232704, "learning_rate": 8.815420340999033e-07, "loss": 0.0013256728649139403, "reward": 1.803125, "reward_std": 0.13762091994285583, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.05531249977648258, "rewards/MazeReward/std": 0.09847460389137268, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.6, "completions/mean_length": 158.69375, "completions/min_length": 48.2, "epoch": 4.52991452991453, "frac_reward_zero_std": 0.875, "grad_norm": 0.7822120163109842, "kl": 0.03773373905569315, "learning_rate": 8.788574348801674e-07, "loss": 0.0015093881636857986, "reward": 1.834375, "reward_std": 0.189805269241333, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.05843750089406967, "rewards/MazeReward/std": 0.0991519644856453, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/mean_length": 148.371875, "completions/min_length": 44.8, "epoch": 4.572649572649572, "frac_reward_zero_std": 0.8875, "grad_norm": 0.7805650338633506, "kl": 0.044839829625561836, "learning_rate": 8.761469412580124e-07, "loss": 0.0017939582467079163, "reward": 1.821875, "reward_std": 0.19949472695589066, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.05718750134110451, "rewards/MazeReward/std": 0.10765503495931625, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.2, "completions/mean_length": 141.103125, "completions/min_length": 48.0, "epoch": 4.615384615384615, "frac_reward_zero_std": 0.8875, "grad_norm": 0.4920259806333268, "kl": 0.0542911626631394, "learning_rate": 8.734107384920769e-07, "loss": 0.002171722613275051, "reward": 1.728125, "reward_std": 0.20894072502851485, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.04781249910593033, "rewards/MazeReward/std": 0.09794875681400299, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.6, "completions/mean_length": 121.7015625, "completions/min_length": 46.0, "epoch": 4.6581196581196584, "frac_reward_zero_std": 0.8875, "grad_norm": 0.09997289593206347, "kl": 0.04621433573774993, "learning_rate": 8.706490135981855e-07, "loss": 0.0018487025052309036, "reward": 1.859375, "reward_std": 0.1668713480234146, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.06093750074505806, "rewards/MazeReward/std": 0.10001743435859681, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.2, "completions/mean_length": 130.375, "completions/min_length": 47.4, "epoch": 4.700854700854701, "frac_reward_zero_std": 0.8375, "grad_norm": 1.1045318983420604, "kl": 0.0453646298032254, "learning_rate": 8.678619553365658e-07, "loss": 0.0018147587776184081, "reward": 1.76875, "reward_std": 0.2367353230714798, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.051874999329447745, "rewards/MazeReward/std": 0.0953328013420105, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.2, "completions/mean_length": 130.9046875, "completions/min_length": 45.4, "epoch": 4.743589743589744, "frac_reward_zero_std": 0.8125, "grad_norm": 0.7177873849402897, "kl": 0.06714675026014447, "learning_rate": 8.650497541989481e-07, "loss": 0.002685732953250408, "reward": 1.796875, "reward_std": 0.2886998623609543, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.05468749850988388, "rewards/MazeReward/std": 0.1036534622311592, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/mean_length": 128.7328125, "completions/min_length": 45.4, "epoch": 4.786324786324786, "frac_reward_zero_std": 0.725, "grad_norm": 1.4582789306751023, "kl": 0.07768260380253196, "learning_rate": 8.622126023955445e-07, "loss": 0.0031074721366167067, "reward": 2.04375, "reward_std": 0.49395993947982786, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.07937500178813935, "rewards/MazeReward/std": 0.13558758199214935, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.8, "completions/mean_length": 126.7515625, "completions/min_length": 45.2, "epoch": 4.829059829059829, "frac_reward_zero_std": 0.8625, "grad_norm": 0.5188677978439834, "kl": 0.09420239464379847, "learning_rate": 8.593506938419119e-07, "loss": 0.0037682272493839266, "reward": 1.940625, "reward_std": 0.25565096735954285, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.0690625011920929, "rewards/MazeReward/std": 0.11698136031627655, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.2, "completions/mean_length": 124.4171875, "completions/min_length": 45.4, "epoch": 4.871794871794872, "frac_reward_zero_std": 0.7625, "grad_norm": 0.9088687585485157, "kl": 0.09596180045045913, "learning_rate": 8.564642241456986e-07, "loss": 0.0038392230868339538, "reward": 2.059375, "reward_std": 0.38991106748580934, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.08093750327825547, "rewards/MazeReward/std": 0.13103083670139312, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.8, "completions/mean_length": 119.2703125, "completions/min_length": 47.2, "epoch": 4.914529914529915, "frac_reward_zero_std": 0.825, "grad_norm": 0.9208085900960126, "kl": 0.10018278225325047, "learning_rate": 8.535533905932737e-07, "loss": 0.004007264971733093, "reward": 1.965625, "reward_std": 0.2675593763589859, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.0715625025331974, "rewards/MazeReward/std": 0.12371677309274673, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.8, "completions/mean_length": 115.0890625, "completions/min_length": 44.0, "epoch": 4.957264957264957, "frac_reward_zero_std": 0.825, "grad_norm": 0.6640474967383764, "kl": 0.10094724758528173, "learning_rate": 8.506183921362442e-07, "loss": 0.0040367752313613895, "reward": 2.1015625, "reward_std": 0.29669988751411436, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.08531250059604645, "rewards/MazeReward/std": 0.12427427470684052, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.4, "completions/mean_length": 136.65625, "completions/min_length": 51.8, "epoch": 5.0, "frac_reward_zero_std": 0.775, "grad_norm": 1.1663694130460183, "kl": 0.09316067076288163, "learning_rate": 8.47659429377856e-07, "loss": 0.0037270143628120424, "reward": 2.028125, "reward_std": 0.45681936144828794, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.07781250178813934, "rewards/MazeReward/std": 0.13334924578666688, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 760.4, "completions/mean_length": 131.6875, "completions/min_length": 49.6, "epoch": 5.042735042735043, "frac_reward_zero_std": 0.8125, "grad_norm": 0.6440667033421631, "kl": 0.10447121229954064, "learning_rate": 8.446767045592829e-07, "loss": 0.004178965836763382, "reward": 2.1078125, "reward_std": 0.29008276015520096, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.08593750149011611, "rewards/MazeReward/std": 0.1316636636853218, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.2, "completions/mean_length": 125.784375, "completions/min_length": 48.0, "epoch": 5.085470085470085, "frac_reward_zero_std": 0.8125, "grad_norm": 0.9390368640983007, "kl": 0.10983694661408663, "learning_rate": 8.416704215458042e-07, "loss": 0.004393426328897476, "reward": 2.228125, "reward_std": 0.35290807485580444, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.0978125050663948, "rewards/MazeReward/std": 0.14086824804544448, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/mean_length": 126.24375, "completions/min_length": 46.4, "epoch": 5.128205128205128, "frac_reward_zero_std": 0.9, "grad_norm": 0.1506841707882417, "kl": 0.10680326581932605, "learning_rate": 8.386407858128706e-07, "loss": 0.004272110760211945, "reward": 1.928125, "reward_std": 0.20876103043556213, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.0678125001490116, "rewards/MazeReward/std": 0.112618388235569, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.8, "completions/mean_length": 124.8765625, "completions/min_length": 49.6, "epoch": 5.170940170940171, "frac_reward_zero_std": 0.825, "grad_norm": 1.0740204866550926, "kl": 0.10116232139989734, "learning_rate": 8.355880044320597e-07, "loss": 0.004046444967389107, "reward": 1.890625, "reward_std": 0.38107420802116393, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.06406250335276127, "rewards/MazeReward/std": 0.1236990287899971, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.6, "completions/mean_length": 134.2015625, "completions/min_length": 47.6, "epoch": 5.213675213675214, "frac_reward_zero_std": 0.7875, "grad_norm": 1.375038482551864, "kl": 0.09841636866331101, "learning_rate": 8.325122860569241e-07, "loss": 0.003936619684100151, "reward": 1.9625, "reward_std": 0.4536560237407684, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.07125000096857548, "rewards/MazeReward/std": 0.1281718507409096, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.4, "completions/mean_length": 131.0296875, "completions/min_length": 46.4, "epoch": 5.256410256410256, "frac_reward_zero_std": 0.85, "grad_norm": 1.1723644368618256, "kl": 0.09129949091002346, "learning_rate": 8.294138409087289e-07, "loss": 0.003652118146419525, "reward": 2.1125, "reward_std": 0.3000807613134384, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.08625000119209289, "rewards/MazeReward/std": 0.12415469884872436, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.8, "completions/mean_length": 131.265625, "completions/min_length": 48.0, "epoch": 5.299145299145299, "frac_reward_zero_std": 0.8625, "grad_norm": 0.9497437788096562, "kl": 0.09446963081136346, "learning_rate": 8.262928807620843e-07, "loss": 0.00377846360206604, "reward": 1.978125, "reward_std": 0.2670121371746063, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.07281250134110451, "rewards/MazeReward/std": 0.12639591097831726, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.8, "completions/mean_length": 135.5703125, "completions/min_length": 48.6, "epoch": 5.3418803418803416, "frac_reward_zero_std": 0.9125, "grad_norm": 0.5190445499016438, "kl": 0.07716012820601463, "learning_rate": 8.231496189304704e-07, "loss": 0.0030866391956806184, "reward": 2.36875, "reward_std": 0.16285933554172516, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11187500059604645, "rewards/MazeReward/std": 0.1347514197230339, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.4, "completions/mean_length": 147.7828125, "completions/min_length": 50.4, "epoch": 5.384615384615385, "frac_reward_zero_std": 0.9375, "grad_norm": 0.09794822319206357, "kl": 0.07791153551079333, "learning_rate": 8.199842702516582e-07, "loss": 0.0031170587986707687, "reward": 1.925, "reward_std": 0.13193328380584718, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.0675000011920929, "rewards/MazeReward/std": 0.11543771624565125, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.4, "completions/mean_length": 136.090625, "completions/min_length": 50.0, "epoch": 5.427350427350428, "frac_reward_zero_std": 0.8, "grad_norm": 1.1149140092457694, "kl": 0.08806668547913432, "learning_rate": 8.167970510730252e-07, "loss": 0.003522975742816925, "reward": 2.3625, "reward_std": 0.36211256980895995, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11124999970197677, "rewards/MazeReward/std": 0.14898888319730758, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.2, "completions/mean_length": 118.378125, "completions/min_length": 49.2, "epoch": 5.47008547008547, "frac_reward_zero_std": 0.85, "grad_norm": 0.6659839381335685, "kl": 0.1098570752888918, "learning_rate": 8.135881792367685e-07, "loss": 0.004394949972629547, "reward": 2.228125, "reward_std": 0.26746952831745147, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.09781250357627869, "rewards/MazeReward/std": 0.144122476875782, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/mean_length": 118.1984375, "completions/min_length": 48.8, "epoch": 5.512820512820513, "frac_reward_zero_std": 0.8, "grad_norm": 0.9565892063556133, "kl": 0.10833388594910502, "learning_rate": 8.103578740650156e-07, "loss": 0.0043334424495697025, "reward": 2.096875, "reward_std": 0.3764511190354824, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.08468750044703484, "rewards/MazeReward/std": 0.13841111958026886, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.4, "completions/mean_length": 114.625, "completions/min_length": 49.4, "epoch": 5.555555555555555, "frac_reward_zero_std": 0.825, "grad_norm": 0.8226224361357182, "kl": 0.11840139674022794, "learning_rate": 8.071063563448339e-07, "loss": 0.004735597595572472, "reward": 1.909375, "reward_std": 0.3718319460749626, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.06593750081956387, "rewards/MazeReward/std": 0.13234255015850066, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.4, "completions/mean_length": 107.1859375, "completions/min_length": 43.4, "epoch": 5.598290598290598, "frac_reward_zero_std": 0.7875, "grad_norm": 0.5636374604837406, "kl": 0.12389737367630005, "learning_rate": 8.038338483131406e-07, "loss": 0.0049566149711608885, "reward": 2.240625, "reward_std": 0.4401042401790619, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.09906250238418579, "rewards/MazeReward/std": 0.1511495217680931, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.4, "completions/mean_length": 110.5671875, "completions/min_length": 48.4, "epoch": 5.641025641025641, "frac_reward_zero_std": 0.8, "grad_norm": 0.10616510206581765, "kl": 0.1140824118629098, "learning_rate": 8.005405736415125e-07, "loss": 0.004563612118363381, "reward": 2.18125, "reward_std": 0.4140210926532745, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.0931250013411045, "rewards/MazeReward/std": 0.14360718429088593, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.4, "completions/mean_length": 112.2515625, "completions/min_length": 43.4, "epoch": 5.683760683760684, "frac_reward_zero_std": 0.8375, "grad_norm": 1.080309819167414, "kl": 0.10736344018951058, "learning_rate": 7.97226757420899e-07, "loss": 0.004295756667852401, "reward": 2.375, "reward_std": 0.3431886717677116, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11250000596046447, "rewards/MazeReward/std": 0.14831583201885223, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/mean_length": 125.2609375, "completions/min_length": 54.0, "epoch": 5.726495726495727, "frac_reward_zero_std": 0.7875, "grad_norm": 1.0543109341141317, "kl": 0.09406610750593244, "learning_rate": 7.938926261462365e-07, "loss": 0.003763087838888168, "reward": 2.4296875, "reward_std": 0.4040140748023987, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.11812500208616257, "rewards/MazeReward/std": 0.15680089294910432, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 136.3890625, "completions/min_length": 54.8, "epoch": 5.769230769230769, "frac_reward_zero_std": 0.7625, "grad_norm": 1.0450825996397608, "kl": 0.09056174824945629, "learning_rate": 7.905384077009692e-07, "loss": 0.003623136132955551, "reward": 2.3216796875, "reward_std": 0.4457092106342316, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.10718750059604645, "rewards/MazeReward/std": 0.15488833487033843, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.2, "completions/mean_length": 133.19375, "completions/min_length": 53.2, "epoch": 5.811965811965812, "frac_reward_zero_std": 0.6875, "grad_norm": 0.988620318778312, "kl": 0.09170094770379364, "learning_rate": 7.871643313414718e-07, "loss": 0.0036685168743133545, "reward": 2.475, "reward_std": 0.5890827000141143, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535533845424652, "rewards/MazeReward/mean": 0.12281250208616257, "rewards/MazeReward/std": 0.16487362384796142, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 739.8, "completions/mean_length": 123.44375, "completions/min_length": 47.8, "epoch": 5.854700854700854, "frac_reward_zero_std": 0.725, "grad_norm": 0.9104290040899861, "kl": 0.11567545076832175, "learning_rate": 7.837706276813818e-07, "loss": 0.004626954346895218, "reward": 2.3546875, "reward_std": 0.6117664694786071, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.11062500178813935, "rewards/MazeReward/std": 0.16645533144474028, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 679.4, "completions/mean_length": 112.5953125, "completions/min_length": 46.4, "epoch": 5.897435897435898, "frac_reward_zero_std": 0.825, "grad_norm": 0.7226788498026553, "kl": 0.12265814091078937, "learning_rate": 7.803575286758363e-07, "loss": 0.004906488955020905, "reward": 2.1091796875, "reward_std": 0.35595683455467225, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535533845424652, "rewards/MazeReward/mean": 0.08625000044703483, "rewards/MazeReward/std": 0.14030689746141434, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 640.8, "completions/mean_length": 121.75, "completions/min_length": 48.6, "epoch": 5.94017094017094, "frac_reward_zero_std": 0.7625, "grad_norm": 0.8459505519586844, "kl": 0.12108418410643935, "learning_rate": 7.769252676056186e-07, "loss": 0.004843691736459732, "reward": 2.2890625, "reward_std": 0.5048299908638001, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.10406250357627869, "rewards/MazeReward/std": 0.1560496523976326, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 1695.6, "completions/mean_length": 126.9140625, "completions/min_length": 50.8, "epoch": 5.982905982905983, "frac_reward_zero_std": 0.75, "grad_norm": 1.124400597193716, "kl": 0.13407598659396172, "learning_rate": 7.734740790612136e-07, "loss": 0.005365237221121788, "reward": 2.0466796875, "reward_std": 0.47798594236373904, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.99375, "rewards/MazeFormat/std": 0.07071067690849304, "rewards/MazeReward/mean": 0.08031250461935997, "rewards/MazeReward/std": 0.14358068108558655, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.6, "completions/mean_length": 110.090625, "completions/min_length": 43.6, "epoch": 6.0256410256410255, "frac_reward_zero_std": 0.6875, "grad_norm": 1.8347446426315819, "kl": 0.1349698563106358, "learning_rate": 7.700041989267736e-07, "loss": 0.005399256944656372, "reward": 2.359375, "reward_std": 0.6089624762535095, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11093750447034836, "rewards/MazeReward/std": 0.1633380174636841, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.6, "completions/mean_length": 106.2421875, "completions/min_length": 46.0, "epoch": 6.068376068376068, "frac_reward_zero_std": 0.9125, "grad_norm": 0.7119869882019543, "kl": 0.13576708221808076, "learning_rate": 7.665158643639969e-07, "loss": 0.00543157123029232, "reward": 2.35, "reward_std": 0.17233721613883973, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11000000089406967, "rewards/MazeReward/std": 0.14970978498458862, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/mean_length": 110.7984375, "completions/min_length": 47.2, "epoch": 6.111111111111111, "frac_reward_zero_std": 0.825, "grad_norm": 1.3542356621009914, "kl": 0.1262268964201212, "learning_rate": 7.63009313795917e-07, "loss": 0.005049209296703339, "reward": 2.453125, "reward_std": 0.3907487615942955, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.12031250596046447, "rewards/MazeReward/std": 0.1551662117242813, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1031.0, "completions/mean_length": 122.815625, "completions/min_length": 46.6, "epoch": 6.153846153846154, "frac_reward_zero_std": 0.7375, "grad_norm": 1.0513048424099745, "kl": 0.1123365402687341, "learning_rate": 7.594847868906076e-07, "loss": 0.004493502527475357, "reward": 2.2828125, "reward_std": 0.45589269399642945, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9953125, "rewards/MazeFormat/std": 0.04257904887199402, "rewards/MazeReward/mean": 0.10375000387430192, "rewards/MazeReward/std": 0.1544831484556198, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.4, "completions/mean_length": 112.59375, "completions/min_length": 43.6, "epoch": 6.196581196581197, "frac_reward_zero_std": 0.7375, "grad_norm": 1.4851889198441417, "kl": 0.12141171535477042, "learning_rate": 7.559425245448005e-07, "loss": 0.004856729134917259, "reward": 2.484375, "reward_std": 0.6057861864566803, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.12343750149011612, "rewards/MazeReward/std": 0.17167751789093016, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.8, "completions/mean_length": 107.7328125, "completions/min_length": 44.4, "epoch": 6.239316239316239, "frac_reward_zero_std": 0.7875, "grad_norm": 0.8669441021866194, "kl": 0.1215637393295765, "learning_rate": 7.523827688674219e-07, "loss": 0.0048632964491844176, "reward": 2.409375, "reward_std": 0.45396568775177004, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1159375011920929, "rewards/MazeReward/std": 0.16991185545921325, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.4, "completions/mean_length": 110.7125, "completions/min_length": 42.0, "epoch": 6.282051282051282, "frac_reward_zero_std": 0.7875, "grad_norm": 1.4722002374740226, "kl": 0.1266021172516048, "learning_rate": 7.488057631630437e-07, "loss": 0.0050648033618927, "reward": 2.525, "reward_std": 0.38043124973773956, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1275000035762787, "rewards/MazeReward/std": 0.16860854327678682, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.8, "completions/mean_length": 107.0140625, "completions/min_length": 44.2, "epoch": 6.3247863247863245, "frac_reward_zero_std": 0.9125, "grad_norm": 0.55055024572662, "kl": 0.11993481060490012, "learning_rate": 7.452117519152541e-07, "loss": 0.004797622561454773, "reward": 2.06875, "reward_std": 0.20705547034740449, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.08187500089406967, "rewards/MazeReward/std": 0.13211893141269684, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.6, "completions/mean_length": 105.2875, "completions/min_length": 44.0, "epoch": 6.367521367521368, "frac_reward_zero_std": 0.8625, "grad_norm": 0.5312250136982063, "kl": 0.1269465253688395, "learning_rate": 7.416009807699481e-07, "loss": 0.005077499523758888, "reward": 2.1375, "reward_std": 0.28113911747932435, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.08875000178813934, "rewards/MazeReward/std": 0.14652519375085832, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.2, "completions/mean_length": 108.621875, "completions/min_length": 39.0, "epoch": 6.410256410256411, "frac_reward_zero_std": 0.875, "grad_norm": 0.9992359610718061, "kl": 0.11899666213430464, "learning_rate": 7.379736965185368e-07, "loss": 0.004760226607322693, "reward": 2.08125, "reward_std": 0.27081257551908494, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.0831249974668026, "rewards/MazeReward/std": 0.13845863491296767, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.6, "completions/mean_length": 113.45625, "completions/min_length": 44.2, "epoch": 6.452991452991453, "frac_reward_zero_std": 0.8625, "grad_norm": 1.1801694424009033, "kl": 0.11703309016302228, "learning_rate": 7.343301470810807e-07, "loss": 0.004681786894798279, "reward": 2.040625, "reward_std": 0.2878511890769005, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.07906250059604644, "rewards/MazeReward/std": 0.13535543382167817, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 121.95625, "completions/min_length": 44.2, "epoch": 6.495726495726496, "frac_reward_zero_std": 0.7625, "grad_norm": 1.0955414704601263, "kl": 0.1092055644840002, "learning_rate": 7.306705814893439e-07, "loss": 0.004368682950735092, "reward": 2.4, "reward_std": 0.4841845452785492, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11500000506639481, "rewards/MazeReward/std": 0.16087428629398345, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.4, "completions/mean_length": 121.575, "completions/min_length": 39.2, "epoch": 6.538461538461538, "frac_reward_zero_std": 0.8625, "grad_norm": 0.5510477790957099, "kl": 0.11705516274087131, "learning_rate": 7.269952498697734e-07, "loss": 0.004682149365544319, "reward": 2.315625, "reward_std": 0.30975441038608553, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.10656250566244126, "rewards/MazeReward/std": 0.15523205399513246, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.8, "completions/mean_length": 134.95625, "completions/min_length": 46.2, "epoch": 6.581196581196581, "frac_reward_zero_std": 0.85, "grad_norm": 1.2102669313683418, "kl": 0.0996331512928009, "learning_rate": 7.233044034264033e-07, "loss": 0.003985384851694107, "reward": 2.29375, "reward_std": 0.31227283328771593, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.10437500327825547, "rewards/MazeReward/std": 0.15053761154413223, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 706.8, "completions/mean_length": 139.1375, "completions/min_length": 44.4, "epoch": 6.6239316239316235, "frac_reward_zero_std": 0.7625, "grad_norm": 0.7817060796702399, "kl": 0.0973482757806778, "learning_rate": 7.195982944236852e-07, "loss": 0.0038939833641052244, "reward": 2.3921875, "reward_std": 0.529007887840271, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.1143750011920929, "rewards/MazeReward/std": 0.16184994280338288, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.6, "completions/mean_length": 133.196875, "completions/min_length": 43.2, "epoch": 6.666666666666667, "frac_reward_zero_std": 0.8375, "grad_norm": 0.09525023329994642, "kl": 0.09814362931065261, "learning_rate": 7.158771761692464e-07, "loss": 0.003926222771406173, "reward": 2.28125, "reward_std": 0.3239602565765381, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.10312499850988388, "rewards/MazeReward/std": 0.14976677149534226, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 126.6578125, "completions/min_length": 48.4, "epoch": 6.7094017094017095, "frac_reward_zero_std": 0.875, "grad_norm": 0.8633826012698371, "kl": 0.10337876132689416, "learning_rate": 7.121413029965769e-07, "loss": 0.00413544774055481, "reward": 2.353125, "reward_std": 0.2962625741958618, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11031250134110451, "rewards/MazeReward/std": 0.14826812595129013, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.8, "completions/mean_length": 116.65625, "completions/min_length": 47.2, "epoch": 6.752136752136752, "frac_reward_zero_std": 0.9375, "grad_norm": 0.8994361258648217, "kl": 0.11636109538376331, "learning_rate": 7.083909302476452e-07, "loss": 0.004654894769191742, "reward": 2.284375, "reward_std": 0.15024999976158143, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.10343750342726707, "rewards/MazeReward/std": 0.14751739650964737, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.4, "completions/mean_length": 113.7375, "completions/min_length": 44.4, "epoch": 6.794871794871795, "frac_reward_zero_std": 0.8375, "grad_norm": 1.3106559165017158, "kl": 0.12230570400133729, "learning_rate": 7.04626314255447e-07, "loss": 0.004891832917928695, "reward": 2.21875, "reward_std": 0.4313662528991699, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.09687500149011612, "rewards/MazeReward/std": 0.16106954216957092, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.6, "completions/mean_length": 114.8796875, "completions/min_length": 44.0, "epoch": 6.837606837606837, "frac_reward_zero_std": 0.725, "grad_norm": 1.5334334590930974, "kl": 0.1225132972933352, "learning_rate": 7.008477123264847e-07, "loss": 0.004900344088673592, "reward": 2.59375, "reward_std": 0.5986619353294372, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.13437500596046448, "rewards/MazeReward/std": 0.17435938417911528, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/mean_length": 110.9640625, "completions/min_length": 44.0, "epoch": 6.880341880341881, "frac_reward_zero_std": 0.8375, "grad_norm": 0.8435043549951452, "kl": 0.12477834653109313, "learning_rate": 6.970553827231808e-07, "loss": 0.004991311207413673, "reward": 2.4609375, "reward_std": 0.32227787673473357, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.12125000357627869, "rewards/MazeReward/std": 0.1671494722366333, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.2, "completions/mean_length": 109.0140625, "completions/min_length": 47.4, "epoch": 6.923076923076923, "frac_reward_zero_std": 0.925, "grad_norm": 1.0045786602311646, "kl": 0.1167522537522018, "learning_rate": 6.932495846462261e-07, "loss": 0.004670744389295578, "reward": 2.36875, "reward_std": 0.1768634021282196, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1118750050663948, "rewards/MazeReward/std": 0.15320810079574584, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.6, "completions/mean_length": 116.41875, "completions/min_length": 46.2, "epoch": 6.965811965811966, "frac_reward_zero_std": 0.75, "grad_norm": 1.28987641874247, "kl": 0.11954717123880983, "learning_rate": 6.894305782168638e-07, "loss": 0.00478287898004055, "reward": 2.346875, "reward_std": 0.5353424847126007, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.10968749970197678, "rewards/MazeReward/std": 0.1676226884126663, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.8, "completions/mean_length": 116.9609375, "completions/min_length": 43.8, "epoch": 7.0085470085470085, "frac_reward_zero_std": 0.8375, "grad_norm": 0.46566542739911215, "kl": 0.11674826825037599, "learning_rate": 6.855986244591103e-07, "loss": 0.004670187830924988, "reward": 2.2888671875, "reward_std": 0.41036257445812224, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.1040624961256981, "rewards/MazeReward/std": 0.1657874584197998, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.8, "completions/mean_length": 104.7578125, "completions/min_length": 43.6, "epoch": 7.051282051282051, "frac_reward_zero_std": 0.8875, "grad_norm": 0.914032778978338, "kl": 0.13094109743833543, "learning_rate": 6.817539852819148e-07, "loss": 0.00523756854236126, "reward": 2.3, "reward_std": 0.24220315217971802, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.10500000417232513, "rewards/MazeReward/std": 0.15769463479518891, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.4, "completions/mean_length": 106.153125, "completions/min_length": 44.2, "epoch": 7.094017094017094, "frac_reward_zero_std": 0.8625, "grad_norm": 0.9417500607932524, "kl": 0.12604689141735434, "learning_rate": 6.778969234612583e-07, "loss": 0.005041994154453278, "reward": 2.396875, "reward_std": 0.32111557126045226, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11468750163912773, "rewards/MazeReward/std": 0.14987295717000962, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.4, "completions/mean_length": 108.1421875, "completions/min_length": 42.2, "epoch": 7.136752136752137, "frac_reward_zero_std": 0.8375, "grad_norm": 0.5286700058047759, "kl": 0.12536690728738903, "learning_rate": 6.740277026221922e-07, "loss": 0.005015048757195472, "reward": 2.41875, "reward_std": 0.342887257039547, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11687499880790711, "rewards/MazeReward/std": 0.17734501659870147, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.6, "completions/mean_length": 113.98125, "completions/min_length": 45.8, "epoch": 7.17948717948718, "frac_reward_zero_std": 0.775, "grad_norm": 0.7403959888442324, "kl": 0.10649480815045535, "learning_rate": 6.701465872208216e-07, "loss": 0.004260452091693878, "reward": 2.509375, "reward_std": 0.5740605995059014, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1259375035762787, "rewards/MazeReward/std": 0.1605320692062378, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/mean_length": 126.059375, "completions/min_length": 42.0, "epoch": 7.222222222222222, "frac_reward_zero_std": 0.8625, "grad_norm": 0.8837213501426033, "kl": 0.10494111906737089, "learning_rate": 6.662538425262284e-07, "loss": 0.004197680950164795, "reward": 2.371875, "reward_std": 0.3282697722315788, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11218750327825547, "rewards/MazeReward/std": 0.15610671639442444, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.2, "completions/mean_length": 122.8390625, "completions/min_length": 46.0, "epoch": 7.264957264957265, "frac_reward_zero_std": 0.75, "grad_norm": 0.9404232152801563, "kl": 0.11009907629340887, "learning_rate": 6.623497346023417e-07, "loss": 0.004404155537486076, "reward": 2.6326171875, "reward_std": 0.6332006573677063, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.13843750059604645, "rewards/MazeReward/std": 0.18257658779621125, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.6, "completions/mean_length": 117.6125, "completions/min_length": 46.2, "epoch": 7.3076923076923075, "frac_reward_zero_std": 0.8875, "grad_norm": 0.8842868714790371, "kl": 0.12074681739322841, "learning_rate": 6.584345302897522e-07, "loss": 0.004830294847488403, "reward": 2.678125, "reward_std": 0.22566969692707062, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1428125023841858, "rewards/MazeReward/std": 0.15798466503620148, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.6, "completions/mean_length": 111.078125, "completions/min_length": 44.8, "epoch": 7.35042735042735, "frac_reward_zero_std": 0.95, "grad_norm": 0.12253566912712557, "kl": 0.13014833349734545, "learning_rate": 6.545084971874736e-07, "loss": 0.0052056387066841125, "reward": 2.134375, "reward_std": 0.07553946599364281, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.0884375050663948, "rewards/MazeReward/std": 0.1338231921195984, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.6, "completions/mean_length": 114.615625, "completions/min_length": 43.4, "epoch": 7.3931623931623935, "frac_reward_zero_std": 0.95, "grad_norm": 0.9828254166110735, "kl": 0.11990115805529059, "learning_rate": 6.505719036346537e-07, "loss": 0.004796605557203293, "reward": 2.08125, "reward_std": 0.0818540021777153, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.08312500044703483, "rewards/MazeReward/std": 0.1296190157532692, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.8, "completions/mean_length": 120.7828125, "completions/min_length": 45.8, "epoch": 7.435897435897436, "frac_reward_zero_std": 0.85, "grad_norm": 1.5641540692769706, "kl": 0.11739973742514849, "learning_rate": 6.466250186922324e-07, "loss": 0.0046964243054389955, "reward": 2.278125, "reward_std": 0.37824141681194307, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.10281250178813935, "rewards/MazeReward/std": 0.16968013048171998, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.8, "completions/mean_length": 130.378125, "completions/min_length": 48.2, "epoch": 7.478632478632479, "frac_reward_zero_std": 0.85, "grad_norm": 0.9879868922490628, "kl": 0.10477358950302004, "learning_rate": 6.426681121245527e-07, "loss": 0.0041921079158782956, "reward": 2.690625, "reward_std": 0.39118525981903074, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.14406249672174454, "rewards/MazeReward/std": 0.17793795466423035, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.6, "completions/mean_length": 136.2828125, "completions/min_length": 48.0, "epoch": 7.521367521367521, "frac_reward_zero_std": 0.8, "grad_norm": 1.2740447986881183, "kl": 0.09708790634758771, "learning_rate": 6.387014543809223e-07, "loss": 0.0038834869861602782, "reward": 2.121875, "reward_std": 0.42704967558383944, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.0871875025331974, "rewards/MazeReward/std": 0.14741710275411607, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 127.08125, "completions/min_length": 48.2, "epoch": 7.564102564102564, "frac_reward_zero_std": 0.775, "grad_norm": 0.8816744498090584, "kl": 0.10946230506524443, "learning_rate": 6.347253165771289e-07, "loss": 0.004379009455442428, "reward": 2.521875, "reward_std": 0.49345279932022096, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1271875023841858, "rewards/MazeReward/std": 0.18064941763877868, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.2, "completions/mean_length": 123.7390625, "completions/min_length": 46.4, "epoch": 7.6068376068376065, "frac_reward_zero_std": 0.8, "grad_norm": 0.7904246843598328, "kl": 0.10876897526904941, "learning_rate": 6.307399704769098e-07, "loss": 0.0043511584401130675, "reward": 2.3984375, "reward_std": 0.4378116011619568, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.11500000059604645, "rewards/MazeReward/std": 0.16837832033634187, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.4, "completions/mean_length": 125.16875, "completions/min_length": 42.2, "epoch": 7.64957264957265, "frac_reward_zero_std": 0.875, "grad_norm": 0.6831600099614877, "kl": 0.11811227649450302, "learning_rate": 6.26745688473377e-07, "loss": 0.004724665731191635, "reward": 2.159375, "reward_std": 0.25841794312000277, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.09093750268220901, "rewards/MazeReward/std": 0.15757765173912047, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.4, "completions/mean_length": 122.315625, "completions/min_length": 43.4, "epoch": 7.6923076923076925, "frac_reward_zero_std": 0.775, "grad_norm": 0.9463456998621641, "kl": 0.12197576817125082, "learning_rate": 6.227427435703995e-07, "loss": 0.004878251254558564, "reward": 2.659375, "reward_std": 0.5090285480022431, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.140937502682209, "rewards/MazeReward/std": 0.18577449023723602, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.4, "completions/mean_length": 124.259375, "completions/min_length": 47.0, "epoch": 7.735042735042735, "frac_reward_zero_std": 0.8125, "grad_norm": 0.9935293007275701, "kl": 0.11919824471697212, "learning_rate": 6.187314093639443e-07, "loss": 0.004768900573253632, "reward": 2.337109375, "reward_std": 0.4040724813938141, "rewards/Format/mean": 0.249609375, "rewards/Format/std": 0.004419417306780815, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1087500050663948, "rewards/MazeReward/std": 0.171444433927536, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.8, "completions/mean_length": 124.9703125, "completions/min_length": 48.8, "epoch": 7.777777777777778, "frac_reward_zero_std": 0.7875, "grad_norm": 0.427378607750444, "kl": 0.10699953152798117, "learning_rate": 6.147119600233758e-07, "loss": 0.0042796477675437926, "reward": 2.640625, "reward_std": 0.5469089686870575, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.13906250447034835, "rewards/MazeReward/std": 0.17976903915405273, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.8, "completions/mean_length": 130.7734375, "completions/min_length": 47.8, "epoch": 7.82051282051282, "frac_reward_zero_std": 0.7875, "grad_norm": 1.114364765131464, "kl": 0.09865478742867709, "learning_rate": 6.106846702727172e-07, "loss": 0.003946560621261597, "reward": 2.365625, "reward_std": 0.5313502073287963, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11156250238418579, "rewards/MazeReward/std": 0.17100094854831696, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 137.840625, "completions/min_length": 49.6, "epoch": 7.863247863247864, "frac_reward_zero_std": 0.825, "grad_norm": 1.1322236029400599, "kl": 0.0958178190048784, "learning_rate": 6.066498153718734e-07, "loss": 0.003832873702049255, "reward": 2.29375, "reward_std": 0.3965612709522247, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.10437500178813934, "rewards/MazeReward/std": 0.159711055457592, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.4, "completions/mean_length": 137.2859375, "completions/min_length": 51.6, "epoch": 7.905982905982906, "frac_reward_zero_std": 0.775, "grad_norm": 0.9033191349982307, "kl": 0.09261846840381623, "learning_rate": 6.026076710978171e-07, "loss": 0.0037041474133729935, "reward": 2.5857421875, "reward_std": 0.520411816239357, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.13375000059604644, "rewards/MazeReward/std": 0.17141990661621093, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 128.3171875, "completions/min_length": 46.4, "epoch": 7.948717948717949, "frac_reward_zero_std": 0.7375, "grad_norm": 1.1026841088488806, "kl": 0.10645872093737126, "learning_rate": 5.985585137257401e-07, "loss": 0.004257069900631905, "reward": 2.884375, "reward_std": 0.6288280010223388, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.16343750655651093, "rewards/MazeReward/std": 0.1841311573982239, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.6, "completions/mean_length": 128.8984375, "completions/min_length": 45.8, "epoch": 7.9914529914529915, "frac_reward_zero_std": 0.85, "grad_norm": 1.031998752242879, "kl": 0.10646879714913667, "learning_rate": 5.945026200101702e-07, "loss": 0.004258693009614944, "reward": 2.790625, "reward_std": 0.3497179388999939, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15406250357627868, "rewards/MazeReward/std": 0.18062105774879456, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.6, "completions/mean_length": 136.0609375, "completions/min_length": 44.8, "epoch": 8.034188034188034, "frac_reward_zero_std": 0.7625, "grad_norm": 1.037419249263571, "kl": 0.10459752553142607, "learning_rate": 5.90440267166055e-07, "loss": 0.004184092953801155, "reward": 2.4390625, "reward_std": 0.5330159664154053, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.1190625011920929, "rewards/MazeReward/std": 0.1763996571302414, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.6, "completions/mean_length": 133.3921875, "completions/min_length": 52.2, "epoch": 8.076923076923077, "frac_reward_zero_std": 0.85, "grad_norm": 1.0914915082654844, "kl": 0.10954389749094844, "learning_rate": 5.863717328498152e-07, "loss": 0.004381101951003075, "reward": 2.4982421875, "reward_std": 0.3982539355754852, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.12499999850988389, "rewards/MazeReward/std": 0.1781415581703186, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 133.075, "completions/min_length": 50.8, "epoch": 8.11965811965812, "frac_reward_zero_std": 0.8, "grad_norm": 0.9990611755328862, "kl": 0.10294655775651336, "learning_rate": 5.82297295140367e-07, "loss": 0.004118229448795319, "reward": 2.390625, "reward_std": 0.492414253950119, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11406250298023224, "rewards/MazeReward/std": 0.1696094572544098, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.2, "completions/mean_length": 130.9890625, "completions/min_length": 45.4, "epoch": 8.162393162393162, "frac_reward_zero_std": 0.8625, "grad_norm": 0.501342223466357, "kl": 0.11401240844279528, "learning_rate": 5.782172325201155e-07, "loss": 0.004560865834355355, "reward": 2.165625, "reward_std": 0.3272204905748367, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.09156250134110451, "rewards/MazeReward/std": 0.15920471251010895, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 785.4, "completions/mean_length": 139.0234375, "completions/min_length": 48.4, "epoch": 8.205128205128204, "frac_reward_zero_std": 0.8625, "grad_norm": 0.9085688756311281, "kl": 0.10424431953579187, "learning_rate": 5.741318238559209e-07, "loss": 0.004170581325888633, "reward": 2.45, "reward_std": 0.3110924273729324, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.02490137964487076, "rewards/MazeReward/mean": 0.12031249850988388, "rewards/MazeReward/std": 0.175260728597641, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.8, "completions/mean_length": 121.1703125, "completions/min_length": 45.6, "epoch": 8.247863247863247, "frac_reward_zero_std": 0.85, "grad_norm": 0.4287849382710462, "kl": 0.11473485874012113, "learning_rate": 5.700413483800389e-07, "loss": 0.004588994011282921, "reward": 2.875, "reward_std": 0.42599530816078185, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.16250000298023223, "rewards/MazeReward/std": 0.1969437777996063, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1040.4, "completions/mean_length": 120.865625, "completions/min_length": 44.8, "epoch": 8.290598290598291, "frac_reward_zero_std": 0.85, "grad_norm": 0.4698008767472208, "kl": 0.11099314470775426, "learning_rate": 5.659460856710345e-07, "loss": 0.004439573734998703, "reward": 2.678125, "reward_std": 0.35614879578351977, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.03535533845424652, "rewards/MazeReward/mean": 0.14312500208616258, "rewards/MazeReward/std": 0.17075003683567047, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 679.6, "completions/mean_length": 116.553125, "completions/min_length": 44.4, "epoch": 8.333333333333334, "frac_reward_zero_std": 0.8125, "grad_norm": 1.1246814355740533, "kl": 0.10966736217960715, "learning_rate": 5.618463156346739e-07, "loss": 0.004387399554252625, "reward": 2.5203125, "reward_std": 0.4758760154247284, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.12718750089406966, "rewards/MazeReward/std": 0.17132539451122283, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 643.6, "completions/mean_length": 117.0546875, "completions/min_length": 45.0, "epoch": 8.376068376068377, "frac_reward_zero_std": 0.8125, "grad_norm": 0.9308115350285304, "kl": 0.12060628677718341, "learning_rate": 5.577423184847931e-07, "loss": 0.004825248569250107, "reward": 2.5078125, "reward_std": 0.4877462863922119, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.1259375035762787, "rewards/MazeReward/std": 0.17260952889919282, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/mean_length": 103.9421875, "completions/min_length": 40.8, "epoch": 8.418803418803419, "frac_reward_zero_std": 0.875, "grad_norm": 0.45122353857825026, "kl": 0.12739773923531175, "learning_rate": 5.536343747241459e-07, "loss": 0.005096112936735153, "reward": 2.7076171875, "reward_std": 0.2750555261969566, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.1459375023841858, "rewards/MazeReward/std": 0.1707939773797989, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.4, "completions/mean_length": 108.8203125, "completions/min_length": 44.6, "epoch": 8.461538461538462, "frac_reward_zero_std": 0.9125, "grad_norm": 0.5176552447530166, "kl": 0.13383744256570934, "learning_rate": 5.495227651252315e-07, "loss": 0.0053529292345047, "reward": 2.628125, "reward_std": 0.2354552686214447, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.13781250268220901, "rewards/MazeReward/std": 0.1868099868297577, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.4, "completions/mean_length": 116.4109375, "completions/min_length": 46.4, "epoch": 8.504273504273504, "frac_reward_zero_std": 0.875, "grad_norm": 0.7040039230728856, "kl": 0.12751290397718548, "learning_rate": 5.454077707111041e-07, "loss": 0.005100805312395096, "reward": 2.84375, "reward_std": 0.34772194027900694, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15937500447034836, "rewards/MazeReward/std": 0.2054050385951996, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.8, "completions/mean_length": 125.3171875, "completions/min_length": 44.6, "epoch": 8.547008547008547, "frac_reward_zero_std": 0.8875, "grad_norm": 0.4855991548830803, "kl": 0.11497611114755273, "learning_rate": 5.412896727361662e-07, "loss": 0.004598812013864517, "reward": 2.396875, "reward_std": 0.29418938159942626, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11468749940395355, "rewards/MazeReward/std": 0.15751168578863145, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.6, "completions/mean_length": 129.1109375, "completions/min_length": 48.8, "epoch": 8.58974358974359, "frac_reward_zero_std": 0.8125, "grad_norm": 1.1949849654241222, "kl": 0.11416345727629959, "learning_rate": 5.371687526669439e-07, "loss": 0.004567617923021317, "reward": 2.446875, "reward_std": 0.5055377662181855, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11968750357627869, "rewards/MazeReward/std": 0.17761976420879363, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/mean_length": 135.553125, "completions/min_length": 49.6, "epoch": 8.632478632478632, "frac_reward_zero_std": 0.8125, "grad_norm": 0.8230226369558373, "kl": 0.10625858008861541, "learning_rate": 5.330452921628497e-07, "loss": 0.0042507462203502655, "reward": 2.765625, "reward_std": 0.5198424816131592, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1515625, "rewards/MazeReward/std": 0.19108597040176392, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/mean_length": 132.9265625, "completions/min_length": 48.0, "epoch": 8.675213675213675, "frac_reward_zero_std": 0.825, "grad_norm": 0.683456389574033, "kl": 0.09912157701328397, "learning_rate": 5.28919573056932e-07, "loss": 0.00396508052945137, "reward": 2.821875, "reward_std": 0.4673504412174225, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15718750655651093, "rewards/MazeReward/std": 0.19530395567417144, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.2, "completions/mean_length": 128.9421875, "completions/min_length": 47.4, "epoch": 8.717948717948717, "frac_reward_zero_std": 0.8875, "grad_norm": 0.7555265167378185, "kl": 0.10403793673031032, "learning_rate": 5.247918773366111e-07, "loss": 0.004162249714136123, "reward": 2.9375, "reward_std": 0.2453582763671875, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.16875, "rewards/MazeReward/std": 0.1709858000278473, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/mean_length": 135.3453125, "completions/min_length": 48.2, "epoch": 8.760683760683762, "frac_reward_zero_std": 0.8375, "grad_norm": 0.5237248226742194, "kl": 0.10214884807355702, "learning_rate": 5.206624871244065e-07, "loss": 0.004086394608020782, "reward": 2.653125, "reward_std": 0.39622444808483126, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1403125062584877, "rewards/MazeReward/std": 0.18336015641689302, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/mean_length": 144.084375, "completions/min_length": 46.2, "epoch": 8.803418803418804, "frac_reward_zero_std": 0.775, "grad_norm": 0.7706414470502965, "kl": 0.09099415931850671, "learning_rate": 5.165316846586541e-07, "loss": 0.003640550747513771, "reward": 2.846875, "reward_std": 0.5451147437095643, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1596875011920929, "rewards/MazeReward/std": 0.1916624754667282, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.6, "completions/mean_length": 151.68125, "completions/min_length": 53.8, "epoch": 8.846153846153847, "frac_reward_zero_std": 0.9125, "grad_norm": 0.7261257688683561, "kl": 0.09205408911220729, "learning_rate": 5.123997522742151e-07, "loss": 0.0036819610744714738, "reward": 2.78125, "reward_std": 0.2184166505932808, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15312500596046447, "rewards/MazeReward/std": 0.19261254966259003, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.6, "completions/mean_length": 142.8265625, "completions/min_length": 50.0, "epoch": 8.88888888888889, "frac_reward_zero_std": 0.7375, "grad_norm": 1.2739755973652953, "kl": 0.09573275893926621, "learning_rate": 5.082669723831793e-07, "loss": 0.0038291953504085543, "reward": 2.6623046875, "reward_std": 0.6662474989891052, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.14125000238418578, "rewards/MazeReward/std": 0.21111677289009095, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.6, "completions/mean_length": 154.9078125, "completions/min_length": 54.6, "epoch": 8.931623931623932, "frac_reward_zero_std": 0.7, "grad_norm": 1.0205023124388044, "kl": 0.0880844673141837, "learning_rate": 5.041336274555625e-07, "loss": 0.003523694723844528, "reward": 2.6294921875, "reward_std": 0.8672631502151489, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.13812500089406968, "rewards/MazeReward/std": 0.21768845319747926, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.2, "completions/mean_length": 153.0296875, "completions/min_length": 52.8, "epoch": 8.974358974358974, "frac_reward_zero_std": 0.8, "grad_norm": 0.8092165727521461, "kl": 0.07932715229690075, "learning_rate": 5e-07, "loss": 0.003172917664051056, "reward": 2.503125, "reward_std": 0.5358554720878601, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.12531249970197678, "rewards/MazeReward/std": 0.17834349870681762, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/mean_length": 152.5109375, "completions/min_length": 55.8, "epoch": 9.017094017094017, "frac_reward_zero_std": 0.725, "grad_norm": 1.3586878566240224, "kl": 0.08491902407258749, "learning_rate": 4.958663725444375e-07, "loss": 0.0033974848687648774, "reward": 2.8109375, "reward_std": 0.677154815196991, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.1562500014901161, "rewards/MazeReward/std": 0.19654485285282136, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.2, "completions/mean_length": 146.475, "completions/min_length": 52.0, "epoch": 9.05982905982906, "frac_reward_zero_std": 0.7625, "grad_norm": 0.34243040085731946, "kl": 0.0910864389501512, "learning_rate": 4.917330276168208e-07, "loss": 0.003643970936536789, "reward": 2.803125, "reward_std": 0.6713367283344269, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1553125038743019, "rewards/MazeReward/std": 0.20383458137512206, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.2, "completions/mean_length": 139.878125, "completions/min_length": 49.8, "epoch": 9.102564102564102, "frac_reward_zero_std": 0.8125, "grad_norm": 0.875129358025704, "kl": 0.10464337235316634, "learning_rate": 4.87600247725785e-07, "loss": 0.004185883700847626, "reward": 3.0625, "reward_std": 0.48923705220222474, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.18125000596046448, "rewards/MazeReward/std": 0.21844838559627533, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.4, "completions/mean_length": 130.3484375, "completions/min_length": 46.2, "epoch": 9.145299145299145, "frac_reward_zero_std": 0.8375, "grad_norm": 0.42757762474877375, "kl": 0.10715236896649002, "learning_rate": 4.834683153413459e-07, "loss": 0.004287224635481835, "reward": 2.64375, "reward_std": 0.37823321521282194, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.13937500417232512, "rewards/MazeReward/std": 0.1645449861884117, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.8, "completions/mean_length": 127.9015625, "completions/min_length": 48.8, "epoch": 9.188034188034187, "frac_reward_zero_std": 0.775, "grad_norm": 1.2347467695458751, "kl": 0.11225857324898243, "learning_rate": 4.793375128755933e-07, "loss": 0.004490474238991737, "reward": 2.990625, "reward_std": 0.6784021079540252, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.17406250536441803, "rewards/MazeReward/std": 0.21663503348827362, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.4, "completions/mean_length": 124.5484375, "completions/min_length": 53.2, "epoch": 9.23076923076923, "frac_reward_zero_std": 0.8, "grad_norm": 0.8469089772634869, "kl": 0.10855783149600029, "learning_rate": 4.752081226633888e-07, "loss": 0.004342434555292129, "reward": 3.11875, "reward_std": 0.5571016371250153, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.18687500953674316, "rewards/MazeReward/std": 0.22037321627140044, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.8, "completions/mean_length": 121.4609375, "completions/min_length": 47.6, "epoch": 9.273504273504274, "frac_reward_zero_std": 0.85, "grad_norm": 0.7604487155971665, "kl": 0.09429952083155513, "learning_rate": 4.71080426943068e-07, "loss": 0.0037718590348958967, "reward": 2.740625, "reward_std": 0.49681556224823, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.14906250387430192, "rewards/MazeReward/std": 0.19011184573173523, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.6, "completions/mean_length": 124.4609375, "completions/min_length": 53.2, "epoch": 9.316239316239317, "frac_reward_zero_std": 0.8375, "grad_norm": 0.6965466774930933, "kl": 0.10939432140439749, "learning_rate": 4.669547078371503e-07, "loss": 0.004375176876783371, "reward": 2.921875, "reward_std": 0.46210750937461853, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.16718750447034836, "rewards/MazeReward/std": 0.19805112481117249, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.8, "completions/mean_length": 129.3796875, "completions/min_length": 47.2, "epoch": 9.35897435897436, "frac_reward_zero_std": 0.7875, "grad_norm": 0.7159299198254374, "kl": 0.09956523487344385, "learning_rate": 4.628312473330562e-07, "loss": 0.003982530534267425, "reward": 2.7263671875, "reward_std": 0.5215662240982055, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.14781250059604645, "rewards/MazeReward/std": 0.19598830342292786, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.4, "completions/mean_length": 123.275, "completions/min_length": 51.4, "epoch": 9.401709401709402, "frac_reward_zero_std": 0.8625, "grad_norm": 0.8152789797212142, "kl": 0.10890261642634869, "learning_rate": 4.5871032726383385e-07, "loss": 0.004355636984109878, "reward": 2.590625, "reward_std": 0.312482450902462, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1340624988079071, "rewards/MazeReward/std": 0.1762324720621109, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.2, "completions/mean_length": 117.2421875, "completions/min_length": 47.4, "epoch": 9.444444444444445, "frac_reward_zero_std": 0.9125, "grad_norm": 1.0145942446437635, "kl": 0.108840207522735, "learning_rate": 4.5459222928889587e-07, "loss": 0.004354207217693329, "reward": 2.721875, "reward_std": 0.2637569785118103, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.14718750417232512, "rewards/MazeReward/std": 0.19373294115066528, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.6, "completions/mean_length": 132.5453125, "completions/min_length": 50.0, "epoch": 9.487179487179487, "frac_reward_zero_std": 0.825, "grad_norm": 1.1849331018616072, "kl": 0.10914051588624715, "learning_rate": 4.5047723487476864e-07, "loss": 0.004366296529769898, "reward": 2.540625, "reward_std": 0.5292814433574676, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1290624976158142, "rewards/MazeReward/std": 0.2007750853896141, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 139.1921875, "completions/min_length": 45.0, "epoch": 9.52991452991453, "frac_reward_zero_std": 0.8125, "grad_norm": 1.286125065843318, "kl": 0.10163588528521358, "learning_rate": 4.463656252758542e-07, "loss": 0.0040661245584487915, "reward": 2.771875, "reward_std": 0.4567874908447266, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1521875038743019, "rewards/MazeReward/std": 0.18613163828849794, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.6, "completions/mean_length": 133.6546875, "completions/min_length": 50.8, "epoch": 9.572649572649572, "frac_reward_zero_std": 0.775, "grad_norm": 1.5016589865441343, "kl": 0.10546655040234328, "learning_rate": 4.4225768151520694e-07, "loss": 0.004218711704015732, "reward": 2.7125, "reward_std": 0.5366943567991257, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.14625000357627868, "rewards/MazeReward/std": 0.21170917451381682, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.8, "completions/mean_length": 138.1640625, "completions/min_length": 50.4, "epoch": 9.615384615384615, "frac_reward_zero_std": 0.6875, "grad_norm": 1.1798897236609454, "kl": 0.09572657975368201, "learning_rate": 4.381536843653261e-07, "loss": 0.0038293473422527312, "reward": 2.8375, "reward_std": 0.7899509906768799, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15875000059604644, "rewards/MazeReward/std": 0.20503087043762208, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.8, "completions/mean_length": 130.64375, "completions/min_length": 48.4, "epoch": 9.658119658119658, "frac_reward_zero_std": 0.825, "grad_norm": 0.6998974993851493, "kl": 0.09763574441894889, "learning_rate": 4.340539143289655e-07, "loss": 0.003905288875102997, "reward": 2.571875, "reward_std": 0.45505390167236326, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.13218750134110452, "rewards/MazeReward/std": 0.17721938192844391, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.2, "completions/mean_length": 135.034375, "completions/min_length": 51.2, "epoch": 9.7008547008547, "frac_reward_zero_std": 0.8, "grad_norm": 0.5155982985374374, "kl": 0.09754181425087154, "learning_rate": 4.2995865161996104e-07, "loss": 0.003902355581521988, "reward": 2.928125, "reward_std": 0.5552009463310241, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1678125038743019, "rewards/MazeReward/std": 0.20430524051189422, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.2, "completions/mean_length": 128.05625, "completions/min_length": 52.4, "epoch": 9.743589743589745, "frac_reward_zero_std": 0.825, "grad_norm": 0.7226746944064936, "kl": 0.11315394174307584, "learning_rate": 4.258681761440789e-07, "loss": 0.004526397585868836, "reward": 2.86875, "reward_std": 0.5527862429618835, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.16187500059604645, "rewards/MazeReward/std": 0.2129779577255249, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.6, "completions/mean_length": 128.296875, "completions/min_length": 49.4, "epoch": 9.786324786324787, "frac_reward_zero_std": 0.825, "grad_norm": 1.3399570894615993, "kl": 0.106480473279953, "learning_rate": 4.2178276747988444e-07, "loss": 0.0042602140456438065, "reward": 3.1888671875, "reward_std": 0.47525117695331576, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.19406250417232512, "rewards/MazeReward/std": 0.21473246216773986, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.6, "completions/mean_length": 126.440625, "completions/min_length": 49.0, "epoch": 9.82905982905983, "frac_reward_zero_std": 0.85, "grad_norm": 0.09503596331402567, "kl": 0.10468061766587197, "learning_rate": 4.1770270485963294e-07, "loss": 0.004187341779470444, "reward": 2.778125, "reward_std": 0.43286781907081606, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15281250178813935, "rewards/MazeReward/std": 0.19566009640693666, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.2, "completions/mean_length": 130.01875, "completions/min_length": 48.6, "epoch": 9.871794871794872, "frac_reward_zero_std": 0.85, "grad_norm": 0.7235398012781328, "kl": 0.10513886674307286, "learning_rate": 4.1362826715018497e-07, "loss": 0.004206154868006707, "reward": 2.775, "reward_std": 0.40023252964019773, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15250000655651091, "rewards/MazeReward/std": 0.20179781019687654, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.4, "completions/mean_length": 134.5703125, "completions/min_length": 50.4, "epoch": 9.914529914529915, "frac_reward_zero_std": 0.9, "grad_norm": 0.3737954120293668, "kl": 0.09545613052323461, "learning_rate": 4.095597328339452e-07, "loss": 0.0038186319172382353, "reward": 2.584375, "reward_std": 0.29082661867141724, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.13343750238418578, "rewards/MazeReward/std": 0.19001711010932923, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.4, "completions/mean_length": 130.8875, "completions/min_length": 49.8, "epoch": 9.957264957264957, "frac_reward_zero_std": 0.9, "grad_norm": 0.06752438787043205, "kl": 0.09968637404963374, "learning_rate": 4.0549737998982994e-07, "loss": 0.003987422212958336, "reward": 2.4984375, "reward_std": 0.26228815913200376, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.12499999850988389, "rewards/MazeReward/std": 0.16459157764911653, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.6, "completions/mean_length": 132.1, "completions/min_length": 51.6, "epoch": 10.0, "frac_reward_zero_std": 0.8375, "grad_norm": 0.875921562762164, "kl": 0.09964474057778716, "learning_rate": 4.0144148627425986e-07, "loss": 0.003986567258834839, "reward": 2.490625, "reward_std": 0.464725586771965, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.12406250089406967, "rewards/MazeReward/std": 0.18573529720306398, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/mean_length": 136.7765625, "completions/min_length": 49.4, "epoch": 10.042735042735043, "frac_reward_zero_std": 0.8, "grad_norm": 0.9439809021490256, "kl": 0.10586870852857828, "learning_rate": 3.973923289021829e-07, "loss": 0.004234510287642479, "reward": 3.0, "reward_std": 0.6187429428100586, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.17500000596046447, "rewards/MazeReward/std": 0.22677642703056336, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/mean_length": 121.2671875, "completions/min_length": 49.0, "epoch": 10.085470085470085, "frac_reward_zero_std": 0.8125, "grad_norm": 0.524992016977372, "kl": 0.1077304735314101, "learning_rate": 3.9335018462812664e-07, "loss": 0.00430932566523552, "reward": 3.1625, "reward_std": 0.5555283963680268, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19125000089406968, "rewards/MazeReward/std": 0.20346350967884064, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/mean_length": 124.9328125, "completions/min_length": 48.8, "epoch": 10.128205128205128, "frac_reward_zero_std": 0.7625, "grad_norm": 0.8855337385204372, "kl": 0.11209777896292508, "learning_rate": 3.893153297272828e-07, "loss": 0.004483478516340256, "reward": 2.740625, "reward_std": 0.713850450515747, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.14906250238418578, "rewards/MazeReward/std": 0.22811269164085388, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.2, "completions/mean_length": 123.625, "completions/min_length": 45.4, "epoch": 10.17094017094017, "frac_reward_zero_std": 0.8625, "grad_norm": 0.9321602133252843, "kl": 0.11280471049249172, "learning_rate": 3.8528803997662423e-07, "loss": 0.004512125626206398, "reward": 2.646875, "reward_std": 0.42052239179611206, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1396875038743019, "rewards/MazeReward/std": 0.19612097144126892, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/mean_length": 118.7015625, "completions/min_length": 46.6, "epoch": 10.213675213675213, "frac_reward_zero_std": 0.85, "grad_norm": 0.5102698579518901, "kl": 0.10920268264599145, "learning_rate": 3.812685906360557e-07, "loss": 0.004368701577186584, "reward": 3.01875, "reward_std": 0.43965511322021483, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.17687500417232513, "rewards/MazeReward/std": 0.1892246425151825, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.8, "completions/mean_length": 121.0359375, "completions/min_length": 46.8, "epoch": 10.256410256410255, "frac_reward_zero_std": 0.8125, "grad_norm": 1.1495768744719015, "kl": 0.11481720227748156, "learning_rate": 3.772572564296004e-07, "loss": 0.00459345206618309, "reward": 2.9375, "reward_std": 0.5186841666698456, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.16875000298023224, "rewards/MazeReward/std": 0.21820463240146637, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.6, "completions/mean_length": 122.940625, "completions/min_length": 48.4, "epoch": 10.2991452991453, "frac_reward_zero_std": 0.8625, "grad_norm": 0.6393428851714967, "kl": 0.10959113240242005, "learning_rate": 3.7325431152662294e-07, "loss": 0.004383664578199387, "reward": 2.653125, "reward_std": 0.3888654768466949, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.14031250178813934, "rewards/MazeReward/std": 0.18066073656082154, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.4, "completions/mean_length": 120.88125, "completions/min_length": 48.2, "epoch": 10.341880341880342, "frac_reward_zero_std": 0.825, "grad_norm": 0.656354448868076, "kl": 0.1106418407522142, "learning_rate": 3.692600295230901e-07, "loss": 0.004426059126853943, "reward": 2.778125, "reward_std": 0.4997374087572098, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15281250029802323, "rewards/MazeReward/std": 0.18737704753875734, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.8, "completions/mean_length": 119.975, "completions/min_length": 47.6, "epoch": 10.384615384615385, "frac_reward_zero_std": 0.775, "grad_norm": 0.6022781128494344, "kl": 0.1218629932962358, "learning_rate": 3.6527468342287096e-07, "loss": 0.004874389618635178, "reward": 2.503125, "reward_std": 0.5901626765727996, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1253125011920929, "rewards/MazeReward/std": 0.19279995262622834, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.2, "completions/mean_length": 117.75, "completions/min_length": 49.0, "epoch": 10.427350427350428, "frac_reward_zero_std": 0.8625, "grad_norm": 1.0773650446880119, "kl": 0.12203081138432026, "learning_rate": 3.612985456190778e-07, "loss": 0.004882078245282173, "reward": 2.84375, "reward_std": 0.3281531363725662, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15937500149011613, "rewards/MazeReward/std": 0.19335278272628784, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.4, "completions/mean_length": 119.4015625, "completions/min_length": 49.4, "epoch": 10.47008547008547, "frac_reward_zero_std": 0.9, "grad_norm": 0.45989035827119584, "kl": 0.12238692920655012, "learning_rate": 3.5733188787544746e-07, "loss": 0.004895142465829849, "reward": 2.425, "reward_std": 0.30953381955623627, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11750000417232513, "rewards/MazeReward/std": 0.17827851474285125, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 125.1125, "completions/min_length": 47.8, "epoch": 10.512820512820513, "frac_reward_zero_std": 0.85, "grad_norm": 0.4765218732894405, "kl": 0.10526276314631104, "learning_rate": 3.533749813077677e-07, "loss": 0.004211057722568512, "reward": 2.9625, "reward_std": 0.44157418608665466, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.17124999761581422, "rewards/MazeReward/std": 0.20729261934757232, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.4, "completions/mean_length": 131.059375, "completions/min_length": 46.0, "epoch": 10.555555555555555, "frac_reward_zero_std": 0.8625, "grad_norm": 0.06937887152965579, "kl": 0.10313204862177372, "learning_rate": 3.4942809636534633e-07, "loss": 0.004125615209341049, "reward": 2.4125, "reward_std": 0.35109097361564634, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.11625000089406967, "rewards/MazeReward/std": 0.17364375293254852, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/mean_length": 125.6140625, "completions/min_length": 48.6, "epoch": 10.598290598290598, "frac_reward_zero_std": 0.7875, "grad_norm": 1.150941241555127, "kl": 0.110233462927863, "learning_rate": 3.454915028125263e-07, "loss": 0.004409436881542206, "reward": 3.0859375, "reward_std": 0.5376386404037475, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.1837500035762787, "rewards/MazeReward/std": 0.20490942895412445, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.6, "completions/mean_length": 125.146875, "completions/min_length": 44.8, "epoch": 10.64102564102564, "frac_reward_zero_std": 0.8625, "grad_norm": 0.5906612685205014, "kl": 0.10231563309207559, "learning_rate": 3.415654697102478e-07, "loss": 0.004092198982834816, "reward": 2.975, "reward_std": 0.41274300813674925, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.17250000387430192, "rewards/MazeReward/std": 0.20381629168987275, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 712.8, "completions/mean_length": 125.378125, "completions/min_length": 46.0, "epoch": 10.683760683760683, "frac_reward_zero_std": 0.825, "grad_norm": 1.092725237784792, "kl": 0.10686524836346507, "learning_rate": 3.3765026539765827e-07, "loss": 0.004274631291627884, "reward": 3.0890625, "reward_std": 0.4289448082447052, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.18406250476837158, "rewards/MazeReward/std": 0.2021147519350052, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.8, "completions/mean_length": 122.9953125, "completions/min_length": 45.8, "epoch": 10.726495726495726, "frac_reward_zero_std": 0.8125, "grad_norm": 1.315849668651416, "kl": 0.10811927812173963, "learning_rate": 3.337461574737716e-07, "loss": 0.004324822500348091, "reward": 2.809375, "reward_std": 0.5549110531806946, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1559375047683716, "rewards/MazeReward/std": 0.2119748830795288, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.6, "completions/mean_length": 119.8640625, "completions/min_length": 45.4, "epoch": 10.76923076923077, "frac_reward_zero_std": 0.775, "grad_norm": 1.0598547332375918, "kl": 0.10716007966548205, "learning_rate": 3.2985341277917846e-07, "loss": 0.004287507385015488, "reward": 2.8419921875, "reward_std": 0.6813661992549896, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.159375, "rewards/MazeReward/std": 0.2184309720993042, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.2, "completions/mean_length": 121.6578125, "completions/min_length": 46.2, "epoch": 10.811965811965813, "frac_reward_zero_std": 0.775, "grad_norm": 1.0596037288036213, "kl": 0.11861755037680269, "learning_rate": 3.2597229737780774e-07, "loss": 0.004745027422904969, "reward": 2.846875, "reward_std": 0.6191704094409942, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1596875011920929, "rewards/MazeReward/std": 0.21988584697246552, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.4, "completions/mean_length": 121.2828125, "completions/min_length": 47.4, "epoch": 10.854700854700855, "frac_reward_zero_std": 0.7875, "grad_norm": 0.8639156915181337, "kl": 0.11051362464204431, "learning_rate": 3.221030765387417e-07, "loss": 0.004420583695173263, "reward": 2.965625, "reward_std": 0.526892964541912, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.17156250178813934, "rewards/MazeReward/std": 0.19438618421554565, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.8, "completions/mean_length": 123.58125, "completions/min_length": 47.4, "epoch": 10.897435897435898, "frac_reward_zero_std": 0.9, "grad_norm": 1.015274606144164, "kl": 0.10902460129000247, "learning_rate": 3.1824601471808497e-07, "loss": 0.00436118096113205, "reward": 2.9625, "reward_std": 0.2762803971767426, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.17125000357627868, "rewards/MazeReward/std": 0.20243596732616426, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.2, "completions/mean_length": 124.1890625, "completions/min_length": 46.0, "epoch": 10.94017094017094, "frac_reward_zero_std": 0.7875, "grad_norm": 0.8250612326370127, "kl": 0.10001396774314344, "learning_rate": 3.1440137554088953e-07, "loss": 0.0040006622672081, "reward": 2.6875, "reward_std": 0.5732998341321945, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.14375000596046447, "rewards/MazeReward/std": 0.18184928297996522, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.2, "completions/mean_length": 125.153125, "completions/min_length": 47.0, "epoch": 10.982905982905983, "frac_reward_zero_std": 0.775, "grad_norm": 0.8773745189185551, "kl": 0.10550105930306017, "learning_rate": 3.1056942178313604e-07, "loss": 0.004220445826649666, "reward": 2.759375, "reward_std": 0.6357157766819, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1509375035762787, "rewards/MazeReward/std": 0.1942424565553665, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.8, "completions/mean_length": 124.2421875, "completions/min_length": 46.2, "epoch": 11.025641025641026, "frac_reward_zero_std": 0.8625, "grad_norm": 0.6622838321365806, "kl": 0.1016877539921552, "learning_rate": 3.06750415353774e-07, "loss": 0.004067954793572426, "reward": 3.0125, "reward_std": 0.44894702434539796, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1762499988079071, "rewards/MazeReward/std": 0.20109855830669404, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.4, "completions/mean_length": 129.7203125, "completions/min_length": 47.8, "epoch": 11.068376068376068, "frac_reward_zero_std": 0.7625, "grad_norm": 0.9540298385632097, "kl": 0.10531652322970331, "learning_rate": 3.029446172768193e-07, "loss": 0.0042134784162044525, "reward": 3.209375, "reward_std": 0.7137496113777161, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19593750238418578, "rewards/MazeReward/std": 0.2500693678855896, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.6, "completions/mean_length": 126.8359375, "completions/min_length": 46.2, "epoch": 11.11111111111111, "frac_reward_zero_std": 0.8625, "grad_norm": 0.8697472365022582, "kl": 0.09979213373735547, "learning_rate": 2.9915228767351535e-07, "loss": 0.003992287442088127, "reward": 3.35625, "reward_std": 0.40894377380609515, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2106250047683716, "rewards/MazeReward/std": 0.2234483391046524, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.2, "completions/mean_length": 125.2, "completions/min_length": 46.8, "epoch": 11.153846153846153, "frac_reward_zero_std": 0.925, "grad_norm": 0.8038699956286568, "kl": 0.10371305770240724, "learning_rate": 2.9537368574455303e-07, "loss": 0.004148208349943161, "reward": 2.81875, "reward_std": 0.17559237778186798, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15687500238418578, "rewards/MazeReward/std": 0.18436635434627532, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.4, "completions/mean_length": 131.8859375, "completions/min_length": 45.0, "epoch": 11.196581196581196, "frac_reward_zero_std": 0.8, "grad_norm": 1.1685515632535388, "kl": 0.1057190123014152, "learning_rate": 2.916090697523549e-07, "loss": 0.004228459671139717, "reward": 2.9125, "reward_std": 0.6130844354629517, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.16625000387430192, "rewards/MazeReward/std": 0.2074292153120041, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.8, "completions/mean_length": 132.5921875, "completions/min_length": 44.8, "epoch": 11.239316239316238, "frac_reward_zero_std": 0.8, "grad_norm": 0.9525409732104532, "kl": 0.09947788114659488, "learning_rate": 2.878586970034232e-07, "loss": 0.0039792083203792575, "reward": 2.715625, "reward_std": 0.6362109780311584, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.14656249880790712, "rewards/MazeReward/std": 0.20846615433692933, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.8, "completions/mean_length": 132.5765625, "completions/min_length": 48.0, "epoch": 11.282051282051283, "frac_reward_zero_std": 0.8125, "grad_norm": 1.012743064388902, "kl": 0.09928330578841268, "learning_rate": 2.841228238307536e-07, "loss": 0.003972093015909195, "reward": 2.8421875, "reward_std": 0.5440541952848434, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.159375, "rewards/MazeReward/std": 0.20577182173728942, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/mean_length": 126.5203125, "completions/min_length": 49.8, "epoch": 11.324786324786325, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5037648450165342, "kl": 0.10580486245453358, "learning_rate": 2.8040170557631485e-07, "loss": 0.0042315319180488585, "reward": 2.6625, "reward_std": 0.5498288422822952, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.14124999940395355, "rewards/MazeReward/std": 0.2035118579864502, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.4, "completions/mean_length": 122.934375, "completions/min_length": 45.0, "epoch": 11.367521367521368, "frac_reward_zero_std": 0.8875, "grad_norm": 1.0345783179989916, "kl": 0.10461925230920315, "learning_rate": 2.7669559657359673e-07, "loss": 0.004185299202799797, "reward": 2.89375, "reward_std": 0.33696596026420594, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1643750011920929, "rewards/MazeReward/std": 0.1924690842628479, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/mean_length": 117.09375, "completions/min_length": 44.4, "epoch": 11.41025641025641, "frac_reward_zero_std": 0.8125, "grad_norm": 0.6335818415855536, "kl": 0.11745278518646955, "learning_rate": 2.730047501302266e-07, "loss": 0.004698439687490463, "reward": 2.909375, "reward_std": 0.5527952641248703, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.16593750119209288, "rewards/MazeReward/std": 0.22284969091415405, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.4, "completions/mean_length": 115.3859375, "completions/min_length": 47.0, "epoch": 11.452991452991453, "frac_reward_zero_std": 0.875, "grad_norm": 0.13903078959487353, "kl": 0.12655633548274636, "learning_rate": 2.6932941851065615e-07, "loss": 0.005062433332204819, "reward": 3.159375, "reward_std": 0.3380592703819275, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19093750417232513, "rewards/MazeReward/std": 0.22119204699993134, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.4, "completions/mean_length": 110.978125, "completions/min_length": 44.0, "epoch": 11.495726495726496, "frac_reward_zero_std": 0.75, "grad_norm": 0.972347935406839, "kl": 0.12469737268984318, "learning_rate": 2.656698529189193e-07, "loss": 0.004987946525216102, "reward": 2.85, "reward_std": 0.7588642656803131, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15999999791383743, "rewards/MazeReward/std": 0.20993177592754364, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.8, "completions/mean_length": 109.29375, "completions/min_length": 45.8, "epoch": 11.538461538461538, "frac_reward_zero_std": 0.8875, "grad_norm": 0.5534386973615292, "kl": 0.12874501328915358, "learning_rate": 2.620263034814632e-07, "loss": 0.005150691419839859, "reward": 2.846875, "reward_std": 0.3248968005180359, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15968750715255736, "rewards/MazeReward/std": 0.20917012691497802, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 634.2, "completions/mean_length": 108.83125, "completions/min_length": 45.6, "epoch": 11.581196581196581, "frac_reward_zero_std": 0.825, "grad_norm": 0.9414528186973988, "kl": 0.11885394980199634, "learning_rate": 2.58399019230052e-07, "loss": 0.004754256457090378, "reward": 3.0359375, "reward_std": 0.4852349281311035, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.17875000238418579, "rewards/MazeReward/std": 0.211050084233284, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 666.6, "completions/mean_length": 114.1015625, "completions/min_length": 47.6, "epoch": 11.623931623931623, "frac_reward_zero_std": 0.825, "grad_norm": 1.1805673739035394, "kl": 0.12760760858654976, "learning_rate": 2.547882480847461e-07, "loss": 0.005104497820138931, "reward": 2.9171875, "reward_std": 0.5961668491363525, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.16687500178813935, "rewards/MazeReward/std": 0.21798062324523926, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/mean_length": 117.7140625, "completions/min_length": 46.8, "epoch": 11.666666666666666, "frac_reward_zero_std": 0.7625, "grad_norm": 1.1715898226877905, "kl": 0.1191184351220727, "learning_rate": 2.5119423683695657e-07, "loss": 0.004765240848064423, "reward": 3.478125, "reward_std": 0.7005012273788452, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2228125035762787, "rewards/MazeReward/std": 0.25120378732681276, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/mean_length": 114.2796875, "completions/min_length": 47.2, "epoch": 11.709401709401709, "frac_reward_zero_std": 0.8375, "grad_norm": 0.8756316340865241, "kl": 0.11611179038882255, "learning_rate": 2.476172311325783e-07, "loss": 0.004644834622740746, "reward": 3.053125, "reward_std": 0.5067808628082275, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.18031250536441804, "rewards/MazeReward/std": 0.23398211300373079, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/mean_length": 114.0765625, "completions/min_length": 46.2, "epoch": 11.752136752136753, "frac_reward_zero_std": 0.7875, "grad_norm": 1.2590183342356303, "kl": 0.11228699986822903, "learning_rate": 2.440574754551996e-07, "loss": 0.0044922705739736555, "reward": 3.503125, "reward_std": 0.6268679082393647, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2253125101327896, "rewards/MazeReward/std": 0.2107793927192688, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.8, "completions/mean_length": 114.615625, "completions/min_length": 45.6, "epoch": 11.794871794871796, "frac_reward_zero_std": 0.825, "grad_norm": 1.1992929533345873, "kl": 0.12027762932702898, "learning_rate": 2.4051521310939254e-07, "loss": 0.004810808598995209, "reward": 2.684375, "reward_std": 0.46283130049705506, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.14343750327825547, "rewards/MazeReward/std": 0.20171170830726623, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.2, "completions/mean_length": 119.6328125, "completions/min_length": 48.6, "epoch": 11.837606837606838, "frac_reward_zero_std": 0.7875, "grad_norm": 1.0103995796070642, "kl": 0.11282904949039221, "learning_rate": 2.3699068620408301e-07, "loss": 0.004513732343912125, "reward": 2.925, "reward_std": 0.6301033198833466, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.16750000268220902, "rewards/MazeReward/std": 0.2202708065509796, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.6, "completions/mean_length": 117.1765625, "completions/min_length": 46.2, "epoch": 11.88034188034188, "frac_reward_zero_std": 0.8125, "grad_norm": 0.9374329600105505, "kl": 0.10730244438163936, "learning_rate": 2.3348413563600323e-07, "loss": 0.004292643815279007, "reward": 3.290625, "reward_std": 0.557636970281601, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.20406250059604644, "rewards/MazeReward/std": 0.22283052206039428, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.4, "completions/mean_length": 122.728125, "completions/min_length": 48.6, "epoch": 11.923076923076923, "frac_reward_zero_std": 0.775, "grad_norm": 0.7025300585187501, "kl": 0.11257844744250178, "learning_rate": 2.2999580107322654e-07, "loss": 0.004503637924790383, "reward": 2.625, "reward_std": 0.7034467041492463, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.13750000596046447, "rewards/MazeReward/std": 0.2158157378435135, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.2, "completions/mean_length": 119.29375, "completions/min_length": 48.8, "epoch": 11.965811965811966, "frac_reward_zero_std": 0.7875, "grad_norm": 0.9630107428037026, "kl": 0.11099038491956889, "learning_rate": 2.2652592093878665e-07, "loss": 0.004440478980541229, "reward": 3.009375, "reward_std": 0.6271803140640259, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.17593750655651091, "rewards/MazeReward/std": 0.2160946547985077, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/mean_length": 120.184375, "completions/min_length": 46.8, "epoch": 12.008547008547009, "frac_reward_zero_std": 0.8125, "grad_norm": 1.0502469988141634, "kl": 0.10649899104610086, "learning_rate": 2.2307473239438152e-07, "loss": 0.0042601808905601505, "reward": 3.21875, "reward_std": 0.5823124885559082, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.196875, "rewards/MazeReward/std": 0.2266264945268631, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.4, "completions/mean_length": 117.7, "completions/min_length": 50.8, "epoch": 12.051282051282051, "frac_reward_zero_std": 0.8625, "grad_norm": 1.4774073020856746, "kl": 0.1107333465013653, "learning_rate": 2.1964247132416368e-07, "loss": 0.004429550841450691, "reward": 2.475, "reward_std": 0.4280003309249878, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.12250000089406968, "rewards/MazeReward/std": 0.19068382680416107, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.4, "completions/mean_length": 114.815625, "completions/min_length": 48.4, "epoch": 12.094017094017094, "frac_reward_zero_std": 0.7625, "grad_norm": 0.8023690241930199, "kl": 0.11019802512601018, "learning_rate": 2.1622937231861822e-07, "loss": 0.004407884925603867, "reward": 2.846875, "reward_std": 0.7527072727680206, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1596875011920929, "rewards/MazeReward/std": 0.21881043016910554, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.8, "completions/mean_length": 110.5859375, "completions/min_length": 47.0, "epoch": 12.136752136752136, "frac_reward_zero_std": 0.7625, "grad_norm": 1.1439206208019748, "kl": 0.1163951527327299, "learning_rate": 2.128356686585282e-07, "loss": 0.004655531421303749, "reward": 3.15, "reward_std": 0.727622926235199, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19000000655651092, "rewards/MazeReward/std": 0.23026613295078277, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.6, "completions/mean_length": 113.9546875, "completions/min_length": 48.4, "epoch": 12.179487179487179, "frac_reward_zero_std": 0.8625, "grad_norm": 0.07746796563807994, "kl": 0.11500273984856904, "learning_rate": 2.0946159229903088e-07, "loss": 0.004600329324603081, "reward": 2.984375, "reward_std": 0.4566905677318573, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1734375, "rewards/MazeReward/std": 0.21304075717926024, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.6, "completions/mean_length": 113.6953125, "completions/min_length": 44.0, "epoch": 12.222222222222221, "frac_reward_zero_std": 0.75, "grad_norm": 0.9638497800872682, "kl": 0.1109996922314167, "learning_rate": 2.0610737385376348e-07, "loss": 0.004439861327409744, "reward": 2.915625, "reward_std": 0.7669902503490448, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.16656250059604644, "rewards/MazeReward/std": 0.21833944022655488, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.4, "completions/mean_length": 118.51875, "completions/min_length": 47.2, "epoch": 12.264957264957266, "frac_reward_zero_std": 0.6875, "grad_norm": 0.9058044499170169, "kl": 0.11539947343990206, "learning_rate": 2.0277324257910106e-07, "loss": 0.0046162322163581845, "reward": 2.8, "reward_std": 1.0353084444999694, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15499999821186067, "rewards/MazeReward/std": 0.2393491506576538, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 121.1828125, "completions/min_length": 48.0, "epoch": 12.307692307692308, "frac_reward_zero_std": 0.8375, "grad_norm": 1.4637020383260568, "kl": 0.11225917679257691, "learning_rate": 1.9945942635848745e-07, "loss": 0.004491125792264938, "reward": 2.90625, "reward_std": 0.4800775945186615, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1656249940395355, "rewards/MazeReward/std": 0.20969883799552919, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.2, "completions/mean_length": 120.4625, "completions/min_length": 46.8, "epoch": 12.350427350427351, "frac_reward_zero_std": 0.825, "grad_norm": 0.6810633086759158, "kl": 0.10299722058698535, "learning_rate": 1.9616615168685942e-07, "loss": 0.004119713604450226, "reward": 3.08125, "reward_std": 0.5347940564155579, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.18312500715255736, "rewards/MazeReward/std": 0.22275058329105377, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.2, "completions/mean_length": 116.7078125, "completions/min_length": 45.4, "epoch": 12.393162393162394, "frac_reward_zero_std": 0.9125, "grad_norm": 0.7530645981845013, "kl": 0.12037030216306448, "learning_rate": 1.9289364365516607e-07, "loss": 0.004815234988927841, "reward": 3.05625, "reward_std": 0.2405204713344574, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.18062500655651093, "rewards/MazeReward/std": 0.19829190224409105, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 121.1796875, "completions/min_length": 45.6, "epoch": 12.435897435897436, "frac_reward_zero_std": 0.8875, "grad_norm": 0.8855228807329661, "kl": 0.11501931981183589, "learning_rate": 1.896421259349844e-07, "loss": 0.004600993543863297, "reward": 2.825, "reward_std": 0.35327176451683046, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15750000178813933, "rewards/MazeReward/std": 0.19714967012405396, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.6, "completions/mean_length": 122.2359375, "completions/min_length": 50.6, "epoch": 12.478632478632479, "frac_reward_zero_std": 0.8125, "grad_norm": 1.0707229873925135, "kl": 0.10961648882366717, "learning_rate": 1.8641182076323148e-07, "loss": 0.00438457727432251, "reward": 3.20625, "reward_std": 0.5865340948104858, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1956250011920929, "rewards/MazeReward/std": 0.24408069550991057, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/mean_length": 122.0171875, "completions/min_length": 48.0, "epoch": 12.521367521367521, "frac_reward_zero_std": 0.8375, "grad_norm": 0.8344007462948576, "kl": 0.10531781297177076, "learning_rate": 1.8320294892697475e-07, "loss": 0.004212798923254013, "reward": 2.86875, "reward_std": 0.47283042669296266, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.16187499910593034, "rewards/MazeReward/std": 0.20645065903663634, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.4, "completions/mean_length": 126.6984375, "completions/min_length": 46.6, "epoch": 12.564102564102564, "frac_reward_zero_std": 0.825, "grad_norm": 0.7680317788100143, "kl": 0.11068193083629012, "learning_rate": 1.8001572974834168e-07, "loss": 0.004427079856395721, "reward": 2.95625, "reward_std": 0.4864831566810608, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1706250101327896, "rewards/MazeReward/std": 0.2195913314819336, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.8, "completions/mean_length": 126.28125, "completions/min_length": 48.8, "epoch": 12.606837606837606, "frac_reward_zero_std": 0.85, "grad_norm": 0.8213676769472825, "kl": 0.10914504565298558, "learning_rate": 1.768503810695295e-07, "loss": 0.004366424679756164, "reward": 3.153125, "reward_std": 0.48257118463516235, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19031250178813935, "rewards/MazeReward/std": 0.22344300746917725, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.8, "completions/mean_length": 129.70625, "completions/min_length": 47.6, "epoch": 12.649572649572649, "frac_reward_zero_std": 0.85, "grad_norm": 0.4507625002743985, "kl": 0.1023398591671139, "learning_rate": 1.7370711923791564e-07, "loss": 0.004094987362623215, "reward": 3.3, "reward_std": 0.45387778878211976, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.20500000417232514, "rewards/MazeReward/std": 0.23486512005329133, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.8, "completions/mean_length": 125.1671875, "completions/min_length": 49.4, "epoch": 12.692307692307692, "frac_reward_zero_std": 0.9, "grad_norm": 0.49743670335935936, "kl": 0.10411405102349817, "learning_rate": 1.70586159091271e-07, "loss": 0.004164828360080719, "reward": 3.1796875, "reward_std": 0.21673506498336792, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.1931249976158142, "rewards/MazeReward/std": 0.21075069308280944, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.4, "completions/mean_length": 132.65625, "completions/min_length": 49.2, "epoch": 12.735042735042736, "frac_reward_zero_std": 0.725, "grad_norm": 1.310487091991775, "kl": 0.10584182045422494, "learning_rate": 1.674877139430758e-07, "loss": 0.0042341239750385284, "reward": 3.259375, "reward_std": 0.8526325225830078, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.20093750655651094, "rewards/MazeReward/std": 0.23574597835540773, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.8, "completions/mean_length": 133.7375, "completions/min_length": 49.0, "epoch": 12.777777777777779, "frac_reward_zero_std": 0.7375, "grad_norm": 1.1524197214167893, "kl": 0.10455569853074849, "learning_rate": 1.6441199556794034e-07, "loss": 0.0041828140616416935, "reward": 3.3875, "reward_std": 0.8112789869308472, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21375000178813935, "rewards/MazeReward/std": 0.24604512751102448, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/mean_length": 134.11875, "completions/min_length": 49.6, "epoch": 12.820512820512821, "frac_reward_zero_std": 0.8, "grad_norm": 1.0393822846213643, "kl": 0.10254335454665124, "learning_rate": 1.6135921418712955e-07, "loss": 0.004102487117052078, "reward": 3.134375, "reward_std": 0.595169198513031, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1884375035762787, "rewards/MazeReward/std": 0.22971762716770172, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.6, "completions/mean_length": 132.5453125, "completions/min_length": 46.4, "epoch": 12.863247863247864, "frac_reward_zero_std": 0.8625, "grad_norm": 1.0572866774601455, "kl": 0.10014389199204743, "learning_rate": 1.5832957845419582e-07, "loss": 0.0040058080106973645, "reward": 3.2625, "reward_std": 0.4272655785083771, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.20125000178813934, "rewards/MazeReward/std": 0.20636436045169831, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 713.0, "completions/mean_length": 135.6796875, "completions/min_length": 48.0, "epoch": 12.905982905982906, "frac_reward_zero_std": 0.7875, "grad_norm": 0.9228517594432902, "kl": 0.1012410223018378, "learning_rate": 1.553232954407171e-07, "loss": 0.00405019074678421, "reward": 2.9953125, "reward_std": 0.6427501201629638, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.17468750476837158, "rewards/MazeReward/std": 0.2177804708480835, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/mean_length": 134.66875, "completions/min_length": 48.0, "epoch": 12.948717948717949, "frac_reward_zero_std": 0.8375, "grad_norm": 0.7058154484867393, "kl": 0.10501653309911489, "learning_rate": 1.52340570622144e-07, "loss": 0.004200476035475731, "reward": 3.4466796875, "reward_std": 0.516084223985672, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21968749761581421, "rewards/MazeReward/std": 0.23806648850440978, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.6, "completions/mean_length": 134.31875, "completions/min_length": 48.6, "epoch": 12.991452991452991, "frac_reward_zero_std": 0.875, "grad_norm": 0.44572165390446444, "kl": 0.1043540752492845, "learning_rate": 1.493816078637557e-07, "loss": 0.004174098372459412, "reward": 3.203125, "reward_std": 0.4287213921546936, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19531250298023223, "rewards/MazeReward/std": 0.24020220935344697, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.4, "completions/mean_length": 136.5453125, "completions/min_length": 50.6, "epoch": 13.034188034188034, "frac_reward_zero_std": 0.8, "grad_norm": 1.1601946120753088, "kl": 0.10335386814549566, "learning_rate": 1.4644660940672627e-07, "loss": 0.004134422540664673, "reward": 3.31875, "reward_std": 0.7683353304862977, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2068750023841858, "rewards/MazeReward/std": 0.25922371447086334, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.8, "completions/mean_length": 131.1671875, "completions/min_length": 47.6, "epoch": 13.076923076923077, "frac_reward_zero_std": 0.775, "grad_norm": 0.9508085146281441, "kl": 0.10196628724224865, "learning_rate": 1.435357758543015e-07, "loss": 0.004078666120767594, "reward": 3.553125, "reward_std": 0.7546874701976776, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.23031250536441802, "rewards/MazeReward/std": 0.2488747239112854, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.6, "completions/mean_length": 132.328125, "completions/min_length": 51.2, "epoch": 13.11965811965812, "frac_reward_zero_std": 0.8625, "grad_norm": 0.9270066606620684, "kl": 0.09956136830151081, "learning_rate": 1.4064930615808806e-07, "loss": 0.003982898965477944, "reward": 3.271875, "reward_std": 0.40897447764873507, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.20218750536441804, "rewards/MazeReward/std": 0.2285274773836136, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.8, "completions/mean_length": 135.9625, "completions/min_length": 50.6, "epoch": 13.162393162393162, "frac_reward_zero_std": 0.7875, "grad_norm": 0.612416092717352, "kl": 0.09921041326597332, "learning_rate": 1.3778739760445552e-07, "loss": 0.003968718647956848, "reward": 2.96875, "reward_std": 0.7903600454330444, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.171875, "rewards/MazeReward/std": 0.23524323403835296, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.4, "completions/mean_length": 137.615625, "completions/min_length": 50.0, "epoch": 13.205128205128204, "frac_reward_zero_std": 0.8625, "grad_norm": 1.0284612331588838, "kl": 0.10387767017818987, "learning_rate": 1.349502458010519e-07, "loss": 0.004155702888965607, "reward": 3.121875, "reward_std": 0.496600079536438, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.18718750178813934, "rewards/MazeReward/std": 0.24737907946109772, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.4, "completions/mean_length": 135.1421875, "completions/min_length": 48.0, "epoch": 13.247863247863247, "frac_reward_zero_std": 0.8125, "grad_norm": 1.1606217573456257, "kl": 0.10176367992535233, "learning_rate": 1.321380446634342e-07, "loss": 0.004071044176816941, "reward": 3.0576171875, "reward_std": 0.622650396823883, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.18093750774860382, "rewards/MazeReward/std": 0.23252680003643036, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.6, "completions/mean_length": 136.353125, "completions/min_length": 52.4, "epoch": 13.290598290598291, "frac_reward_zero_std": 0.825, "grad_norm": 0.05934888405961084, "kl": 0.10582958087325096, "learning_rate": 1.2935098640181457e-07, "loss": 0.0042334794998168945, "reward": 3.5515625, "reward_std": 0.564618581533432, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.23031250536441802, "rewards/MazeReward/std": 0.23733251690864562, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.8, "completions/mean_length": 140.5203125, "completions/min_length": 54.2, "epoch": 13.333333333333334, "frac_reward_zero_std": 0.75, "grad_norm": 0.8967278127436061, "kl": 0.09393652775324882, "learning_rate": 1.2658926150792322e-07, "loss": 0.00375819131731987, "reward": 3.31875, "reward_std": 0.811944055557251, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.20687500536441802, "rewards/MazeReward/std": 0.2688955247402191, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/mean_length": 143.8890625, "completions/min_length": 51.4, "epoch": 13.376068376068377, "frac_reward_zero_std": 0.9, "grad_norm": 0.6689887124628756, "kl": 0.09893502001650631, "learning_rate": 1.2385305874198775e-07, "loss": 0.003957881778478623, "reward": 3.55, "reward_std": 0.3778967708349228, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.23000000715255736, "rewards/MazeReward/std": 0.22691603899002075, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.2, "completions/mean_length": 136.734375, "completions/min_length": 49.6, "epoch": 13.418803418803419, "frac_reward_zero_std": 0.8375, "grad_norm": 0.8305684758545985, "kl": 0.08917890437878669, "learning_rate": 1.2114256511983274e-07, "loss": 0.003567250818014145, "reward": 3.24375, "reward_std": 0.5553144633769989, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19937500953674317, "rewards/MazeReward/std": 0.237774994969368, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.2, "completions/mean_length": 134.6171875, "completions/min_length": 55.0, "epoch": 13.461538461538462, "frac_reward_zero_std": 0.8125, "grad_norm": 0.6341620757128188, "kl": 0.1032931875437498, "learning_rate": 1.1845796590009683e-07, "loss": 0.0041314858943223955, "reward": 3.425, "reward_std": 0.6744720757007598, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21750000417232512, "rewards/MazeReward/std": 0.25507332682609557, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.4, "completions/mean_length": 133.471875, "completions/min_length": 54.2, "epoch": 13.504273504273504, "frac_reward_zero_std": 0.825, "grad_norm": 0.4614397177704931, "kl": 0.09606116027571261, "learning_rate": 1.1579944457157059e-07, "loss": 0.0038429252803325654, "reward": 3.584375, "reward_std": 0.528683266043663, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2334374964237213, "rewards/MazeReward/std": 0.25245185792446134, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 727.8, "completions/mean_length": 144.0140625, "completions/min_length": 50.0, "epoch": 13.547008547008547, "frac_reward_zero_std": 0.8125, "grad_norm": 1.0238721759018097, "kl": 0.09148284844122827, "learning_rate": 1.1316718284065535e-07, "loss": 0.0036599829792976378, "reward": 3.384375, "reward_std": 0.6477352797985076, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.996875, "rewards/MazeFormat/std": 0.02490137964487076, "rewards/MazeReward/mean": 0.21375000923871995, "rewards/MazeReward/std": 0.21816131472587585, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/mean_length": 138.340625, "completions/min_length": 55.0, "epoch": 13.58974358974359, "frac_reward_zero_std": 0.7375, "grad_norm": 0.9521251840405882, "kl": 0.09690871224738658, "learning_rate": 1.1056136061894384e-07, "loss": 0.0038771606981754304, "reward": 3.640625, "reward_std": 0.9544131755828857, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.23906249701976776, "rewards/MazeReward/std": 0.26780156791210175, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 684.0, "completions/mean_length": 141.0671875, "completions/min_length": 50.2, "epoch": 13.632478632478632, "frac_reward_zero_std": 0.8375, "grad_norm": 0.04557115783910534, "kl": 0.09328375519253314, "learning_rate": 1.0798215601092353e-07, "loss": 0.0037319328635931014, "reward": 3.0359375, "reward_std": 0.41956892013549807, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.17875000238418579, "rewards/MazeReward/std": 0.21701315343379973, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.2, "completions/mean_length": 131.0828125, "completions/min_length": 49.4, "epoch": 13.675213675213675, "frac_reward_zero_std": 0.8, "grad_norm": 1.2025631515659523, "kl": 0.10294593628495932, "learning_rate": 1.0542974530180327e-07, "loss": 0.0041183046996593475, "reward": 3.2875, "reward_std": 0.6173423409461976, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.20375000387430192, "rewards/MazeReward/std": 0.22543151676654816, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 698.4, "completions/mean_length": 135.4375, "completions/min_length": 46.4, "epoch": 13.717948717948717, "frac_reward_zero_std": 0.8, "grad_norm": 0.7796861515777691, "kl": 0.10390158286318182, "learning_rate": 1.0290430294546448e-07, "loss": 0.004156334698200226, "reward": 3.3857421875, "reward_std": 0.7448040068149566, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.21375000476837158, "rewards/MazeReward/std": 0.2573282241821289, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.8, "completions/mean_length": 130.2, "completions/min_length": 50.4, "epoch": 13.760683760683762, "frac_reward_zero_std": 0.7875, "grad_norm": 0.98910838336713, "kl": 0.10653561083599924, "learning_rate": 1.0040600155253764e-07, "loss": 0.004261004179716111, "reward": 3.246875, "reward_std": 0.6785987883806228, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19968749582767487, "rewards/MazeReward/std": 0.2529229700565338, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/mean_length": 124.7015625, "completions/min_length": 46.2, "epoch": 13.803418803418804, "frac_reward_zero_std": 0.8625, "grad_norm": 0.6280736093035446, "kl": 0.11661676904186606, "learning_rate": 9.793501187860431e-08, "loss": 0.004665160179138183, "reward": 3.175, "reward_std": 0.45869363844394684, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19250000417232513, "rewards/MazeReward/std": 0.2327058345079422, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.8, "completions/mean_length": 128.015625, "completions/min_length": 47.8, "epoch": 13.846153846153847, "frac_reward_zero_std": 0.825, "grad_norm": 1.1229957421935337, "kl": 0.10365095781162381, "learning_rate": 9.549150281252632e-08, "loss": 0.004146835952997208, "reward": 3.365625, "reward_std": 0.585090035200119, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21156250238418578, "rewards/MazeReward/std": 0.24423626065254211, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.4, "completions/mean_length": 129.1359375, "completions/min_length": 49.0, "epoch": 13.88888888888889, "frac_reward_zero_std": 0.8, "grad_norm": 1.3051344498253252, "kl": 0.11281033270061017, "learning_rate": 9.307564136490254e-08, "loss": 0.004512753337621689, "reward": 3.371875, "reward_std": 0.6207172274589539, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21218750774860382, "rewards/MazeReward/std": 0.24975141286849975, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/mean_length": 126.565625, "completions/min_length": 47.0, "epoch": 13.931623931623932, "frac_reward_zero_std": 0.875, "grad_norm": 0.40871228210294197, "kl": 0.10675760302692652, "learning_rate": 9.068759265665382e-08, "loss": 0.004270961135625839, "reward": 2.965625, "reward_std": 0.3837372213602066, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1715625047683716, "rewards/MazeReward/std": 0.21898579895496367, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.8, "completions/mean_length": 121.6375, "completions/min_length": 48.8, "epoch": 13.974358974358974, "frac_reward_zero_std": 0.8875, "grad_norm": 0.9853878316290574, "kl": 0.10937570687383413, "learning_rate": 8.832751990773712e-08, "loss": 0.004375287890434265, "reward": 2.846875, "reward_std": 0.34098189175128935, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.15968750268220902, "rewards/MazeReward/std": 0.19211819767951965, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/mean_length": 128.046875, "completions/min_length": 48.0, "epoch": 14.017094017094017, "frac_reward_zero_std": 0.85, "grad_norm": 0.6220137988989953, "kl": 0.10704006981104612, "learning_rate": 8.599558442598998e-08, "loss": 0.004281745105981827, "reward": 3.50625, "reward_std": 0.5477135837078094, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.22562500238418579, "rewards/MazeReward/std": 0.2416780710220337, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.4, "completions/mean_length": 118.753125, "completions/min_length": 49.6, "epoch": 14.05982905982906, "frac_reward_zero_std": 0.825, "grad_norm": 1.1807223503571544, "kl": 0.11027584793046116, "learning_rate": 8.369194559610481e-08, "loss": 0.004411678016185761, "reward": 3.28125, "reward_std": 0.5337654531002045, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.20312500596046448, "rewards/MazeReward/std": 0.22464993894100188, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.2, "completions/mean_length": 121.665625, "completions/min_length": 51.2, "epoch": 14.102564102564102, "frac_reward_zero_std": 0.8875, "grad_norm": 0.6645196225456695, "kl": 0.11633407985791563, "learning_rate": 8.141676086873573e-08, "loss": 0.004653770476579666, "reward": 3.659375, "reward_std": 0.4128348171710968, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2409375011920929, "rewards/MazeReward/std": 0.26526184678077697, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.6, "completions/mean_length": 117.334375, "completions/min_length": 47.4, "epoch": 14.145299145299145, "frac_reward_zero_std": 0.9375, "grad_norm": 0.05438863761076759, "kl": 0.11503907395526766, "learning_rate": 7.917018574973644e-08, "loss": 0.004602273926138878, "reward": 3.1125, "reward_std": 0.1789622038602829, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1862500011920929, "rewards/MazeReward/std": 0.21130214631557465, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 121.6921875, "completions/min_length": 46.2, "epoch": 14.188034188034187, "frac_reward_zero_std": 0.8125, "grad_norm": 0.43688586874881835, "kl": 0.10734036625362933, "learning_rate": 7.695237378953224e-08, "loss": 0.004293971136212349, "reward": 3.446875, "reward_std": 0.6536878108978271, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21968749761581421, "rewards/MazeReward/std": 0.2504511445760727, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.4, "completions/mean_length": 123.55625, "completions/min_length": 47.2, "epoch": 14.23076923076923, "frac_reward_zero_std": 0.85, "grad_norm": 0.844809871553244, "kl": 0.10472417911514639, "learning_rate": 7.476347657262455e-08, "loss": 0.004189522564411163, "reward": 2.959375, "reward_std": 0.48555052280426025, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.17093749940395356, "rewards/MazeReward/std": 0.20453031957149506, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/mean_length": 127.9421875, "completions/min_length": 48.6, "epoch": 14.273504273504274, "frac_reward_zero_std": 0.875, "grad_norm": 0.522968792512682, "kl": 0.09963853806257247, "learning_rate": 7.260364370723043e-08, "loss": 0.0039858706295490265, "reward": 3.35, "reward_std": 0.4939745903015137, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21000000536441804, "rewards/MazeReward/std": 0.23409392535686493, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 700.4, "completions/mean_length": 134.346875, "completions/min_length": 47.8, "epoch": 14.316239316239317, "frac_reward_zero_std": 0.7625, "grad_norm": 1.0720480282615144, "kl": 0.10253564361482859, "learning_rate": 7.047302281505735e-08, "loss": 0.004102340340614319, "reward": 3.6546875, "reward_std": 0.7884458363056183, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.2406250059604645, "rewards/MazeReward/std": 0.2581137716770172, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.2, "completions/mean_length": 125.48125, "completions/min_length": 48.6, "epoch": 14.35897435897436, "frac_reward_zero_std": 0.8375, "grad_norm": 0.5287232895639771, "kl": 0.10590210733935237, "learning_rate": 6.837175952121304e-08, "loss": 0.004236004501581192, "reward": 3.190625, "reward_std": 0.5882582247257233, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19406250417232512, "rewards/MazeReward/std": 0.2406783401966095, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.4, "completions/mean_length": 126.9703125, "completions/min_length": 49.8, "epoch": 14.401709401709402, "frac_reward_zero_std": 0.7875, "grad_norm": 0.8930005912084705, "kl": 0.10905647352337837, "learning_rate": 6.629999744425235e-08, "loss": 0.004362818598747253, "reward": 3.359375, "reward_std": 0.7467323809862136, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2109375, "rewards/MazeReward/std": 0.25135611593723295, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/mean_length": 126.4796875, "completions/min_length": 48.0, "epoch": 14.444444444444445, "frac_reward_zero_std": 0.8375, "grad_norm": 0.8842969125336764, "kl": 0.10364811704494059, "learning_rate": 6.42578781863613e-08, "loss": 0.004146735370159149, "reward": 3.1625, "reward_std": 0.5814606785774231, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19125000834465028, "rewards/MazeReward/std": 0.23819169998168946, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.6, "completions/mean_length": 126.4609375, "completions/min_length": 52.4, "epoch": 14.487179487179487, "frac_reward_zero_std": 0.75, "grad_norm": 0.7308075090785238, "kl": 0.11490621510893106, "learning_rate": 6.22455413236786e-08, "loss": 0.004596540331840515, "reward": 3.821875, "reward_std": 0.9392133474349975, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2571874916553497, "rewards/MazeReward/std": 0.28829089999198915, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.2, "completions/mean_length": 123.490625, "completions/min_length": 46.8, "epoch": 14.52991452991453, "frac_reward_zero_std": 0.875, "grad_norm": 0.5203656707355103, "kl": 0.09874770562164485, "learning_rate": 6.026312439675551e-08, "loss": 0.003949685022234917, "reward": 3.175, "reward_std": 0.47585675716400144, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19250000417232513, "rewards/MazeReward/std": 0.2218406856060028, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.8, "completions/mean_length": 125.51875, "completions/min_length": 46.4, "epoch": 14.572649572649572, "frac_reward_zero_std": 0.875, "grad_norm": 0.6697032216645531, "kl": 0.10887362537905573, "learning_rate": 5.831076290115572e-08, "loss": 0.004355831816792488, "reward": 3.5125, "reward_std": 0.41589419543743134, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2262500047683716, "rewards/MazeReward/std": 0.24999509155750274, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.2, "completions/mean_length": 118.8578125, "completions/min_length": 48.0, "epoch": 14.615384615384615, "frac_reward_zero_std": 0.8875, "grad_norm": 0.7010706958799113, "kl": 0.11055122390389442, "learning_rate": 5.638859027819409e-08, "loss": 0.004422678798437119, "reward": 3.278125, "reward_std": 0.3859258651733398, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.20281250178813934, "rewards/MazeReward/std": 0.23284580707550048, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.2, "completions/mean_length": 122.209375, "completions/min_length": 46.4, "epoch": 14.658119658119658, "frac_reward_zero_std": 0.8125, "grad_norm": 0.6684899129688294, "kl": 0.10815434050746262, "learning_rate": 5.44967379058161e-08, "loss": 0.004326200857758522, "reward": 3.196875, "reward_std": 0.6069713115692139, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19468750357627868, "rewards/MazeReward/std": 0.23656201660633086, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/mean_length": 122.9078125, "completions/min_length": 48.6, "epoch": 14.7008547008547, "frac_reward_zero_std": 0.7375, "grad_norm": 0.7944424841316382, "kl": 0.10805548503994941, "learning_rate": 5.263533508961826e-08, "loss": 0.004322724044322967, "reward": 3.403125, "reward_std": 0.8790005326271058, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21531250178813935, "rewards/MazeReward/std": 0.27138482630252836, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.2, "completions/mean_length": 125.675, "completions/min_length": 47.0, "epoch": 14.743589743589745, "frac_reward_zero_std": 0.85, "grad_norm": 0.8311964411594716, "kl": 0.10785716827958822, "learning_rate": 5.080450905401057e-08, "loss": 0.0043143235146999356, "reward": 3.321875, "reward_std": 0.47051496505737306, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.20718750655651091, "rewards/MazeReward/std": 0.2368873655796051, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.4, "completions/mean_length": 120.0015625, "completions/min_length": 48.2, "epoch": 14.786324786324787, "frac_reward_zero_std": 0.8, "grad_norm": 1.2412365855858876, "kl": 0.10242009852081538, "learning_rate": 4.9004384933520547e-08, "loss": 0.004096883535385132, "reward": 3.59375, "reward_std": 0.7541570663452148, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.23437500298023223, "rewards/MazeReward/std": 0.2849434196949005, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.4, "completions/mean_length": 117.9265625, "completions/min_length": 48.4, "epoch": 14.82905982905983, "frac_reward_zero_std": 0.775, "grad_norm": 1.1105087434794185, "kl": 0.1089911924675107, "learning_rate": 4.723508576424062e-08, "loss": 0.004359452426433564, "reward": 3.5375, "reward_std": 0.7639246016740799, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.22874999940395355, "rewards/MazeReward/std": 0.24069324731826783, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.2, "completions/mean_length": 122.603125, "completions/min_length": 50.2, "epoch": 14.871794871794872, "frac_reward_zero_std": 0.8, "grad_norm": 0.7388592958462435, "kl": 0.10980640817433596, "learning_rate": 4.549673247541874e-08, "loss": 0.004392564296722412, "reward": 3.678125, "reward_std": 0.7469538986682892, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.24281249642372132, "rewards/MazeReward/std": 0.2900205373764038, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.4, "completions/mean_length": 121.2125, "completions/min_length": 46.8, "epoch": 14.914529914529915, "frac_reward_zero_std": 0.9125, "grad_norm": 0.43934120657856396, "kl": 0.10940547166392207, "learning_rate": 4.37894438811931e-08, "loss": 0.004376126080751419, "reward": 3.409375, "reward_std": 0.2898747891187668, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21593750417232513, "rewards/MazeReward/std": 0.241091787815094, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.2, "completions/mean_length": 118.225, "completions/min_length": 48.0, "epoch": 14.957264957264957, "frac_reward_zero_std": 0.7625, "grad_norm": 1.187885559295768, "kl": 0.11253669201396406, "learning_rate": 4.2113336672471245e-08, "loss": 0.004501381888985634, "reward": 3.640625, "reward_std": 0.8781503081321717, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.23906251192092895, "rewards/MazeReward/std": 0.2923721134662628, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.2, "completions/mean_length": 122.45625, "completions/min_length": 44.2, "epoch": 15.0, "frac_reward_zero_std": 0.7875, "grad_norm": 0.8092896972547012, "kl": 0.10736672207713127, "learning_rate": 4.0468525408954456e-08, "loss": 0.00429476797580719, "reward": 3.684375, "reward_std": 0.7190027356147766, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2434375047683716, "rewards/MazeReward/std": 0.2688789367675781, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/mean_length": 122.8734375, "completions/min_length": 44.6, "epoch": 15.042735042735043, "frac_reward_zero_std": 0.8875, "grad_norm": 0.5830990891113115, "kl": 0.11255799029022455, "learning_rate": 3.8855122511307626e-08, "loss": 0.004502619802951813, "reward": 3.46875, "reward_std": 0.33688003718853, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.22187500596046447, "rewards/MazeReward/std": 0.24869490265846253, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 705.2, "completions/mean_length": 128.025, "completions/min_length": 45.0, "epoch": 15.085470085470085, "frac_reward_zero_std": 0.775, "grad_norm": 0.4328270413121731, "kl": 0.1091361996717751, "learning_rate": 3.727323825347578e-08, "loss": 0.004365786164999008, "reward": 4.0669921875, "reward_std": 0.756450217962265, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.28187501430511475, "rewards/MazeReward/std": 0.27423061430454254, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.6, "completions/mean_length": 119.975, "completions/min_length": 47.4, "epoch": 15.128205128205128, "frac_reward_zero_std": 0.8375, "grad_norm": 0.81299525416062, "kl": 0.10865907510742545, "learning_rate": 3.572298075514652e-08, "loss": 0.004346990585327148, "reward": 3.284375, "reward_std": 0.6671978831291199, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.20343750417232515, "rewards/MazeReward/std": 0.24995078146457672, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.4, "completions/mean_length": 124.7359375, "completions/min_length": 50.6, "epoch": 15.17094017094017, "frac_reward_zero_std": 0.8875, "grad_norm": 1.1087066393589085, "kl": 0.1049537037499249, "learning_rate": 3.420445597436056e-08, "loss": 0.004198139905929566, "reward": 3.5125, "reward_std": 0.4084074795246124, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.22625000774860382, "rewards/MazeReward/std": 0.2444426268339157, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.2, "completions/mean_length": 118.8203125, "completions/min_length": 47.2, "epoch": 15.213675213675213, "frac_reward_zero_std": 0.85, "grad_norm": 0.7614062817623753, "kl": 0.10607459908351302, "learning_rate": 3.271776770026963e-08, "loss": 0.004243284463882446, "reward": 3.178125, "reward_std": 0.4721804141998291, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19281249791383742, "rewards/MazeReward/std": 0.23438346683979033, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.2, "completions/mean_length": 122.6671875, "completions/min_length": 49.4, "epoch": 15.256410256410255, "frac_reward_zero_std": 0.8875, "grad_norm": 0.6127489399171744, "kl": 0.10655979104340077, "learning_rate": 3.1263017546042326e-08, "loss": 0.004262470826506615, "reward": 3.496875, "reward_std": 0.36138366907835007, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.22468750476837157, "rewards/MazeReward/std": 0.23243287205696106, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.4, "completions/mean_length": 122.134375, "completions/min_length": 50.0, "epoch": 15.2991452991453, "frac_reward_zero_std": 0.9, "grad_norm": 0.967510995638688, "kl": 0.115428361389786, "learning_rate": 2.9840304941919416e-08, "loss": 0.004617030173540116, "reward": 3.2, "reward_std": 0.36295809745788576, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.1949999988079071, "rewards/MazeReward/std": 0.2410827934741974, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.2, "completions/mean_length": 124.8078125, "completions/min_length": 46.2, "epoch": 15.341880341880342, "frac_reward_zero_std": 0.8625, "grad_norm": 0.5796572302503418, "kl": 0.10713190361857414, "learning_rate": 2.8449727128417367e-08, "loss": 0.004285677149891853, "reward": 3.521875, "reward_std": 0.4946269363164902, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.22718750834465026, "rewards/MazeReward/std": 0.26889726221561433, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.8, "completions/mean_length": 121.8359375, "completions/min_length": 49.2, "epoch": 15.384615384615385, "frac_reward_zero_std": 0.8375, "grad_norm": 0.05075725166593872, "kl": 0.0982246644794941, "learning_rate": 2.7091379149682682e-08, "loss": 0.0039296291768550875, "reward": 3.671875, "reward_std": 0.5362778604030609, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2421875089406967, "rewards/MazeReward/std": 0.2428381234407425, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.6, "completions/mean_length": 123.79375, "completions/min_length": 45.6, "epoch": 15.427350427350428, "frac_reward_zero_std": 0.875, "grad_norm": 0.7072293760700058, "kl": 0.11436028825119138, "learning_rate": 2.5765353846995297e-08, "loss": 0.004575078934431076, "reward": 3.53125, "reward_std": 0.45395667403936385, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.22812499701976777, "rewards/MazeReward/std": 0.2579535335302353, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.8, "completions/mean_length": 120.3171875, "completions/min_length": 51.8, "epoch": 15.47008547008547, "frac_reward_zero_std": 0.8, "grad_norm": 1.1385185290789412, "kl": 0.11759204547852278, "learning_rate": 2.4471741852423233e-08, "loss": 0.0047042079269886015, "reward": 4.0, "reward_std": 0.6519853830337524, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2750000089406967, "rewards/MazeReward/std": 0.28275521993637087, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.2, "completions/mean_length": 122.0671875, "completions/min_length": 47.2, "epoch": 15.512820512820513, "frac_reward_zero_std": 0.85, "grad_norm": 0.9564171205522918, "kl": 0.10564614557661116, "learning_rate": 2.3210631582627927e-08, "loss": 0.004226198792457581, "reward": 3.725, "reward_std": 0.5720295608043671, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.24750000089406968, "rewards/MazeReward/std": 0.2586837708950043, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.2, "completions/mean_length": 121.2984375, "completions/min_length": 44.0, "epoch": 15.555555555555555, "frac_reward_zero_std": 0.825, "grad_norm": 1.2406529941181959, "kl": 0.10482329577207565, "learning_rate": 2.1982109232821176e-08, "loss": 0.004193376749753952, "reward": 3.56875, "reward_std": 0.5979138255119324, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2318750023841858, "rewards/MazeReward/std": 0.26024834215641024, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/mean_length": 120.415625, "completions/min_length": 44.6, "epoch": 15.598290598290598, "frac_reward_zero_std": 0.85, "grad_norm": 0.7017519406514044, "kl": 0.1080109877511859, "learning_rate": 2.0786258770873645e-08, "loss": 0.004319808259606361, "reward": 3.378125, "reward_std": 0.43282521367073057, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21281250417232514, "rewards/MazeReward/std": 0.2369197577238083, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.6, "completions/mean_length": 126.8125, "completions/min_length": 49.2, "epoch": 15.64102564102564, "frac_reward_zero_std": 0.875, "grad_norm": 0.6632277424765783, "kl": 0.10668526445515454, "learning_rate": 1.9623161931575926e-08, "loss": 0.004267299920320511, "reward": 3.559375, "reward_std": 0.5012161135673523, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.23093750476837158, "rewards/MazeReward/std": 0.27313116788864134, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.8, "completions/mean_length": 120.003125, "completions/min_length": 47.6, "epoch": 15.683760683760683, "frac_reward_zero_std": 0.85, "grad_norm": 0.5135442974101795, "kl": 0.11582731227390468, "learning_rate": 1.849289821105199e-08, "loss": 0.004633168503642082, "reward": 3.34375, "reward_std": 0.4953270524740219, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.20937500894069672, "rewards/MazeReward/std": 0.255164110660553, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.4, "completions/mean_length": 119.6359375, "completions/min_length": 47.0, "epoch": 15.726495726495726, "frac_reward_zero_std": 0.8875, "grad_norm": 1.0016497259218642, "kl": 0.10574633032083511, "learning_rate": 1.7395544861325718e-08, "loss": 0.004230240359902382, "reward": 3.0966796875, "reward_std": 0.304620361328125, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.18468749821186065, "rewards/MazeReward/std": 0.21719041466712952, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.6, "completions/mean_length": 128.990625, "completions/min_length": 47.6, "epoch": 15.76923076923077, "frac_reward_zero_std": 0.825, "grad_norm": 0.5174454406155103, "kl": 0.10564749445766211, "learning_rate": 1.6331176885040876e-08, "loss": 0.004225829616189003, "reward": 3.515625, "reward_std": 0.6882634222507477, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.22656250298023223, "rewards/MazeReward/std": 0.2732086151838303, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.8, "completions/mean_length": 123.2875, "completions/min_length": 48.2, "epoch": 15.811965811965813, "frac_reward_zero_std": 0.8, "grad_norm": 0.8255536120984532, "kl": 0.10030470197089017, "learning_rate": 1.5299867030334813e-08, "loss": 0.004012863337993622, "reward": 3.371875, "reward_std": 0.7521422803401947, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2121875047683716, "rewards/MazeReward/std": 0.2528801321983337, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.4, "completions/mean_length": 121.8, "completions/min_length": 50.2, "epoch": 15.854700854700855, "frac_reward_zero_std": 0.9125, "grad_norm": 0.812809914447486, "kl": 0.10551224481314421, "learning_rate": 1.4301685785866213e-08, "loss": 0.004221369326114654, "reward": 3.259375, "reward_std": 0.31027054488658906, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2009375125169754, "rewards/MazeReward/std": 0.2311701625585556, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.8, "completions/mean_length": 126.9515625, "completions/min_length": 47.0, "epoch": 15.897435897435898, "frac_reward_zero_std": 0.8125, "grad_norm": 1.1091420453814027, "kl": 0.10986831542104483, "learning_rate": 1.3336701375997127e-08, "loss": 0.004395525902509689, "reward": 4.075, "reward_std": 0.670307207107544, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2824999988079071, "rewards/MazeReward/std": 0.27602744698524473, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.2, "completions/mean_length": 127.453125, "completions/min_length": 49.8, "epoch": 15.94017094017094, "frac_reward_zero_std": 0.8625, "grad_norm": 0.8145535007785947, "kl": 0.10255064629018307, "learning_rate": 1.240497975613014e-08, "loss": 0.004101991280913353, "reward": 3.7375, "reward_std": 0.5223577737808227, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2487500011920929, "rewards/MazeReward/std": 0.2582669973373413, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.6, "completions/mean_length": 126.9578125, "completions/min_length": 46.6, "epoch": 15.982905982905983, "frac_reward_zero_std": 0.825, "grad_norm": 0.06424905878178337, "kl": 0.1100274601019919, "learning_rate": 1.1506584608200364e-08, "loss": 0.004401101171970368, "reward": 3.38125, "reward_std": 0.5968389272689819, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21312500834465026, "rewards/MazeReward/std": 0.2849972754716873, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.6, "completions/mean_length": 127.4640625, "completions/min_length": 49.2, "epoch": 16.025641025641026, "frac_reward_zero_std": 0.775, "grad_norm": 0.6487314480801746, "kl": 0.10238274387083948, "learning_rate": 1.0641577336322761e-08, "loss": 0.004095544293522835, "reward": 3.975, "reward_std": 0.8413453936576843, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2725000143051147, "rewards/MazeReward/std": 0.3040153205394745, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.6, "completions/mean_length": 124.6984375, "completions/min_length": 47.6, "epoch": 16.068376068376068, "frac_reward_zero_std": 0.875, "grad_norm": 0.49436098483755675, "kl": 0.10609784824773669, "learning_rate": 9.810017062595321e-09, "loss": 0.004243911057710647, "reward": 3.334375, "reward_std": 0.4042118787765503, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2084375023841858, "rewards/MazeReward/std": 0.2501231372356415, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/mean_length": 124.1859375, "completions/min_length": 50.0, "epoch": 16.11111111111111, "frac_reward_zero_std": 0.8, "grad_norm": 0.508436919192622, "kl": 0.10891414480283856, "learning_rate": 9.011960623058201e-09, "loss": 0.004356931149959564, "reward": 3.59375, "reward_std": 0.6314712464809418, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.23437499403953552, "rewards/MazeReward/std": 0.27095602750778197, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.4, "completions/mean_length": 124.0859375, "completions/min_length": 47.0, "epoch": 16.153846153846153, "frac_reward_zero_std": 0.8625, "grad_norm": 0.6664563867287676, "kl": 0.10197192868217826, "learning_rate": 8.247462563808816e-09, "loss": 0.004078886285424232, "reward": 3.4625, "reward_std": 0.47230331152677535, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.22125000655651092, "rewards/MazeReward/std": 0.2445346087217331, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.4, "completions/mean_length": 121.35, "completions/min_length": 49.4, "epoch": 16.196581196581196, "frac_reward_zero_std": 0.925, "grad_norm": 0.0535281517184304, "kl": 0.11215386167168617, "learning_rate": 7.516575137274162e-09, "loss": 0.004487024992704392, "reward": 3.478125, "reward_std": 0.344576370716095, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2228124976158142, "rewards/MazeReward/std": 0.24724717438220978, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.6, "completions/mean_length": 124.7515625, "completions/min_length": 49.8, "epoch": 16.23931623931624, "frac_reward_zero_std": 0.7875, "grad_norm": 0.7746570584609815, "kl": 0.11612661136314273, "learning_rate": 6.819348298638839e-09, "loss": 0.0046449493616819385, "reward": 3.440625, "reward_std": 0.7381389677524567, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2190625011920929, "rewards/MazeReward/std": 0.282903778553009, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.4, "completions/mean_length": 120.953125, "completions/min_length": 46.0, "epoch": 16.28205128205128, "frac_reward_zero_std": 0.9, "grad_norm": 0.6463953386999981, "kl": 0.10854203356429934, "learning_rate": 6.15582970243117e-09, "loss": 0.004342161864042282, "reward": 3.728125, "reward_std": 0.355919748544693, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.24781250953674316, "rewards/MazeReward/std": 0.2368593394756317, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.2, "completions/mean_length": 121.671875, "completions/min_length": 48.6, "epoch": 16.324786324786324, "frac_reward_zero_std": 0.775, "grad_norm": 1.1793486433508549, "kl": 0.11762102926149964, "learning_rate": 5.526064699265753e-09, "loss": 0.004704833775758743, "reward": 3.4875, "reward_std": 0.7744836688041687, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.22375000417232513, "rewards/MazeReward/std": 0.27049732208251953, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/mean_length": 120.8703125, "completions/min_length": 44.2, "epoch": 16.367521367521366, "frac_reward_zero_std": 0.825, "grad_norm": 0.6733016986298439, "kl": 0.11788390032015741, "learning_rate": 4.9300963327441044e-09, "loss": 0.004716131463646888, "reward": 3.565625, "reward_std": 0.5362739384174346, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.23156249821186065, "rewards/MazeReward/std": 0.2886486887931824, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.8, "completions/mean_length": 122.578125, "completions/min_length": 52.8, "epoch": 16.41025641025641, "frac_reward_zero_std": 0.775, "grad_norm": 1.0382370758710289, "kl": 0.11309504546225072, "learning_rate": 4.367965336512403e-09, "loss": 0.004523798450827598, "reward": 3.396875, "reward_std": 0.803810977935791, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21468749940395354, "rewards/MazeReward/std": 0.2716837853193283, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.6, "completions/mean_length": 122.0703125, "completions/min_length": 45.2, "epoch": 16.45299145299145, "frac_reward_zero_std": 0.825, "grad_norm": 0.8257139308848667, "kl": 0.10980163249187172, "learning_rate": 3.8397101314774915e-09, "loss": 0.004392998665571213, "reward": 3.95625, "reward_std": 0.5920416593551636, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.27062500417232516, "rewards/MazeReward/std": 0.28637204468250277, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 695.0, "completions/mean_length": 127.8203125, "completions/min_length": 46.8, "epoch": 16.495726495726494, "frac_reward_zero_std": 0.8625, "grad_norm": 1.0742469342385899, "kl": 0.10377971744164824, "learning_rate": 3.3453668231809283e-09, "loss": 0.004151855036616326, "reward": 3.6044921875, "reward_std": 0.4786112129688263, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 0.9984375, "rewards/MazeFormat/std": 0.01767766922712326, "rewards/MazeReward/mean": 0.23562500178813933, "rewards/MazeReward/std": 0.25146309435367586, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.4, "completions/mean_length": 120.9828125, "completions/min_length": 44.8, "epoch": 16.53846153846154, "frac_reward_zero_std": 0.8625, "grad_norm": 0.6118948764226286, "kl": 0.1021963557228446, "learning_rate": 2.8849691993311777e-09, "loss": 0.004088918119668961, "reward": 3.6875, "reward_std": 0.47385877966880796, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2437500089406967, "rewards/MazeReward/std": 0.23270266354084015, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.4, "completions/mean_length": 123.25625, "completions/min_length": 47.6, "epoch": 16.581196581196583, "frac_reward_zero_std": 0.7625, "grad_norm": 1.2784114354732232, "kl": 0.10859406501986087, "learning_rate": 2.458548727494292e-09, "loss": 0.004345090314745903, "reward": 3.659375, "reward_std": 0.7301452726125717, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.24093750715255738, "rewards/MazeReward/std": 0.2568228155374527, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.2, "completions/mean_length": 126.1, "completions/min_length": 48.8, "epoch": 16.623931623931625, "frac_reward_zero_std": 0.8625, "grad_norm": 0.9898730665558909, "kl": 0.10555869597010314, "learning_rate": 2.066134552943077e-09, "loss": 0.004222322627902031, "reward": 3.5125, "reward_std": 0.441130667924881, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.22625000178813934, "rewards/MazeReward/std": 0.2487412929534912, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/mean_length": 119.5265625, "completions/min_length": 45.8, "epoch": 16.666666666666668, "frac_reward_zero_std": 0.85, "grad_norm": 1.0217738859538394, "kl": 0.0998825051356107, "learning_rate": 1.7077534966650765e-09, "loss": 0.0039948724210262295, "reward": 3.428125, "reward_std": 0.5493013352155686, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21781250238418579, "rewards/MazeReward/std": 0.21325217485427855, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.6, "completions/mean_length": 121.23125, "completions/min_length": 47.0, "epoch": 16.70940170940171, "frac_reward_zero_std": 0.825, "grad_norm": 0.5884618325504257, "kl": 0.10465789251029492, "learning_rate": 1.383430053529422e-09, "loss": 0.004186463728547096, "reward": 3.6060546875, "reward_std": 0.5122001999523491, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.23562500774860382, "rewards/MazeReward/std": 0.26041761338710784, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.6, "completions/mean_length": 123.5296875, "completions/min_length": 46.0, "epoch": 16.752136752136753, "frac_reward_zero_std": 0.8, "grad_norm": 0.6015423582858093, "kl": 0.1133018615655601, "learning_rate": 1.0931863906127325e-09, "loss": 0.004532093182206154, "reward": 3.45, "reward_std": 0.6776583135128021, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.22000000178813933, "rewards/MazeReward/std": 0.27564533352851867, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.2, "completions/mean_length": 121.4421875, "completions/min_length": 43.2, "epoch": 16.794871794871796, "frac_reward_zero_std": 0.925, "grad_norm": 0.4046282144429942, "kl": 0.11728158215992153, "learning_rate": 8.370423456837139e-10, "loss": 0.0046912945806980135, "reward": 3.6125, "reward_std": 0.24019813239574433, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.23625001609325408, "rewards/MazeReward/std": 0.22056612372398376, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.6, "completions/mean_length": 122.6640625, "completions/min_length": 48.4, "epoch": 16.837606837606838, "frac_reward_zero_std": 0.8875, "grad_norm": 0.18848532642409926, "kl": 0.10375166512094439, "learning_rate": 6.150154258476314e-10, "loss": 0.004150073975324631, "reward": 3.2375, "reward_std": 0.39494048357009887, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.19874999821186065, "rewards/MazeReward/std": 0.23839934766292573, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/mean_length": 121.5640625, "completions/min_length": 47.2, "epoch": 16.88034188034188, "frac_reward_zero_std": 0.7875, "grad_norm": 1.0049912375133132, "kl": 0.10834184312261641, "learning_rate": 4.271208063494902e-10, "loss": 0.004333572089672088, "reward": 3.70625, "reward_std": 0.7638269186019897, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.2456250011920929, "rewards/MazeReward/std": 0.27377059757709504, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/mean_length": 122.1046875, "completions/min_length": 47.8, "epoch": 16.923076923076923, "frac_reward_zero_std": 0.8375, "grad_norm": 0.54479139113427, "kl": 0.10669711260125041, "learning_rate": 2.733713295369755e-10, "loss": 0.004268518462777138, "reward": 3.378125, "reward_std": 0.5172571033239365, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.21281250268220903, "rewards/MazeReward/std": 0.24584324657917023, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/mean_length": 124.6546875, "completions/min_length": 47.0, "epoch": 16.965811965811966, "frac_reward_zero_std": 0.8375, "grad_norm": 0.7059688953566956, "kl": 0.11410397617146373, "learning_rate": 1.53777503982655e-10, "loss": 0.004564845934510231, "reward": 3.671875, "reward_std": 0.5840405106544495, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.24218750596046448, "rewards/MazeReward/std": 0.2895630538463593, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/mean_length": 123.0890625, "completions/min_length": 50.6, "epoch": 17.00854700854701, "frac_reward_zero_std": 0.85, "grad_norm": 0.848786610345723, "kl": 0.11118856389075518, "learning_rate": 6.834750376549791e-11, "loss": 0.004448100179433823, "reward": 3.9060546875, "reward_std": 0.45252889334224167, "rewards/Format/mean": 0.2498046875, "rewards/Format/std": 0.0022097086533904076, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.265625, "rewards/MazeReward/std": 0.2678088635206223, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.2, "completions/mean_length": 121.8, "completions/min_length": 48.4, "epoch": 17.05128205128205, "frac_reward_zero_std": 0.7875, "grad_norm": 0.7312599594007181, "kl": 0.11210196618922055, "learning_rate": 1.7087167912710476e-11, "loss": 0.004484037682414055, "reward": 4.41875, "reward_std": 0.7629148185253143, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.31687500774860383, "rewards/MazeReward/std": 0.3001715898513794, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.4, "completions/mean_length": 118.578125, "completions/min_length": 46.0, "epoch": 17.094017094017094, "frac_reward_zero_std": 0.825, "grad_norm": 0.4560788089531848, "kl": 0.1063311716541648, "learning_rate": 0.0, "loss": 0.00425378829240799, "reward": 3.821875, "reward_std": 0.6984930455684661, "rewards/Format/mean": 0.25, "rewards/Format/std": 0.0, "rewards/MazeFormat/mean": 1.0, "rewards/MazeFormat/std": 0.0, "rewards/MazeReward/mean": 0.25718750059604645, "rewards/MazeReward/std": 0.28713129460811615, "step": 2000 } ], "logging_steps": 5, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 18, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }