{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500.0, "global_step": 294, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4758.0, "completions/mean_length": 1819.5078125, "completions/min_length": 508.0, "epoch": 0.003401360544217687, "frac_reward_zero_std": 0.0, "grad_norm": 0.12744246988948524, "kl": 0.0, "learning_rate": 1.6666666666666665e-07, "loss": -3.3527612686157227e-08, "num_turns": 2.0, "reward": 0.34577980637550354, "reward_std": 0.4898218512535095, "rewards/MLPCodeOnPolicy32BORM/mean": 0.34577980637550354, "rewards/MLPCodeOnPolicy32BORM/std": 0.49940040707588196, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5835.0, "completions/mean_length": 1820.296875, "completions/min_length": 640.0, "epoch": 0.006802721088435374, "frac_reward_zero_std": 0.0, "grad_norm": 0.1201274689767808, "kl": 0.0, "learning_rate": 3.333333333333333e-07, "loss": -9.033828973770142e-08, "num_turns": 2.0, "reward": 0.5030246376991272, "reward_std": 0.45653027296066284, "rewards/MLPCodeOnPolicy32BORM/mean": 0.503024697303772, "rewards/MLPCodeOnPolicy32BORM/std": 0.4683470129966736, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6221.0, "completions/mean_length": 1869.0234375, "completions/min_length": 625.0, "epoch": 0.01020408163265306, "frac_reward_zero_std": 0.0, "grad_norm": 0.13249851266483276, "kl": 5.7374833659196156e-05, "learning_rate": 5e-07, "loss": 1.6391277313232422e-07, "num_turns": 2.0, "reward": 0.4454033672809601, "reward_std": 0.4615671932697296, "rewards/MLPCodeOnPolicy32BORM/mean": 0.44540339708328247, "rewards/MLPCodeOnPolicy32BORM/std": 0.4942783713340759, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4441.0, "completions/mean_length": 1784.91015625, "completions/min_length": 804.0, "epoch": 0.013605442176870748, "frac_reward_zero_std": 0.0, "grad_norm": 0.12573995450643682, "kl": 5.082826066882262e-05, "learning_rate": 4.999854313415308e-07, "loss": 5.587935447692871e-08, "num_turns": 2.0, "reward": 0.5703479051589966, "reward_std": 0.36449864506721497, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5703479051589966, "rewards/MLPCodeOnPolicy32BORM/std": 0.3774077594280243, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5988.0, "completions/mean_length": 1741.33984375, "completions/min_length": 576.0, "epoch": 0.017006802721088437, "frac_reward_zero_std": 0.0, "grad_norm": 0.1373648881605736, "kl": 5.534328431622271e-05, "learning_rate": 4.999417270640898e-07, "loss": 6.332993507385254e-08, "num_turns": 2.0, "reward": 0.33942678570747375, "reward_std": 0.5093421936035156, "rewards/MLPCodeOnPolicy32BORM/mean": 0.33942678570747375, "rewards/MLPCodeOnPolicy32BORM/std": 0.5255665183067322, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6849.0, "completions/mean_length": 1795.203125, "completions/min_length": 579.0, "epoch": 0.02040816326530612, "frac_reward_zero_std": 0.0, "grad_norm": 0.12060672847616874, "kl": 4.775961713221477e-05, "learning_rate": 4.998688922613787e-07, "loss": 6.705522537231445e-08, "num_turns": 2.0, "reward": 0.4659126400947571, "reward_std": 0.4433140754699707, "rewards/MLPCodeOnPolicy32BORM/mean": 0.46591266989707947, "rewards/MLPCodeOnPolicy32BORM/std": 0.46012043952941895, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1754.07421875, "completions/min_length": 666.0, "epoch": 0.023809523809523808, "frac_reward_zero_std": 0.0, "grad_norm": 0.11995636331377409, "kl": 4.119202509400566e-05, "learning_rate": 4.997669354222401e-07, "loss": -1.4901161193847656e-08, "num_turns": 2.0, "reward": 0.5474852323532104, "reward_std": 0.34531816840171814, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5474852323532104, "rewards/MLPCodeOnPolicy32BORM/std": 0.37691301107406616, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3899.0, "completions/mean_length": 1698.69921875, "completions/min_length": 745.0, "epoch": 0.027210884353741496, "frac_reward_zero_std": 0.0, "grad_norm": 0.13098214536619826, "kl": 5.105331194954488e-05, "learning_rate": 4.996358684296693e-07, "loss": 8.754432201385498e-08, "num_turns": 2.0, "reward": 0.5156557559967041, "reward_std": 0.4053615927696228, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5156557559967041, "rewards/MLPCodeOnPolicy32BORM/std": 0.4166768491268158, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5423.0, "completions/mean_length": 1771.01953125, "completions/min_length": 692.0, "epoch": 0.030612244897959183, "frac_reward_zero_std": 0.0, "grad_norm": 0.12797291368590355, "kl": 5.446028086453225e-05, "learning_rate": 4.994757065594279e-07, "loss": -1.1175870895385742e-08, "num_turns": 2.0, "reward": 0.4673098921775818, "reward_std": 0.37777355313301086, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4673098921775818, "rewards/MLPCodeOnPolicy32BORM/std": 0.400544136762619, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1822.95703125, "completions/min_length": 688.0, "epoch": 0.034013605442176874, "frac_reward_zero_std": 0.0, "grad_norm": 0.12201373456484678, "kl": 4.741260931950819e-05, "learning_rate": 4.992864684782648e-07, "loss": 1.1641532182693481e-08, "num_turns": 2.0, "reward": 0.36175915598869324, "reward_std": 0.4400799870491028, "rewards/MLPCodeOnPolicy32BORM/mean": 0.36175912618637085, "rewards/MLPCodeOnPolicy32BORM/std": 0.44151124358177185, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4675.0, "completions/mean_length": 1797.35546875, "completions/min_length": 707.0, "epoch": 0.03741496598639456, "frac_reward_zero_std": 0.0, "grad_norm": 0.12392330273911538, "kl": 4.906168550178336e-05, "learning_rate": 4.9906817624174e-07, "loss": 4.7963112592697144e-08, "num_turns": 2.0, "reward": 0.4319838285446167, "reward_std": 0.4490795135498047, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4319838285446167, "rewards/MLPCodeOnPolicy32BORM/std": 0.477209210395813, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4950.0, "completions/mean_length": 1725.4921875, "completions/min_length": 631.0, "epoch": 0.04081632653061224, "frac_reward_zero_std": 0.0, "grad_norm": 0.13305305669597486, "kl": 5.819839469722865e-05, "learning_rate": 4.988208552916535e-07, "loss": 5.587935447692871e-08, "num_turns": 2.0, "reward": 0.3035702109336853, "reward_std": 0.4499426484107971, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3035701811313629, "rewards/MLPCodeOnPolicy32BORM/std": 0.4780357778072357, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1741.2734375, "completions/min_length": 597.0, "epoch": 0.04421768707482993, "frac_reward_zero_std": 0.0, "grad_norm": 0.13073505697177595, "kl": 5.914523239880509e-05, "learning_rate": 4.98544534453081e-07, "loss": 2.60770320892334e-08, "num_turns": 2.0, "reward": 0.3342248797416687, "reward_std": 0.5042227506637573, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3342248797416687, "rewards/MLPCodeOnPolicy32BORM/std": 0.5114654302597046, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3818.0, "completions/mean_length": 1577.34765625, "completions/min_length": 601.0, "epoch": 0.047619047619047616, "frac_reward_zero_std": 0.0, "grad_norm": 0.13342383735176347, "kl": 5.705263799882232e-05, "learning_rate": 4.98239245931014e-07, "loss": 1.4901161193847656e-08, "num_turns": 2.0, "reward": 0.5360592603683472, "reward_std": 0.3911046087741852, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5360592007637024, "rewards/MLPCodeOnPolicy32BORM/std": 0.4391091465950012, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4162.0, "completions/mean_length": 1765.125, "completions/min_length": 585.0, "epoch": 0.05102040816326531, "frac_reward_zero_std": 0.0, "grad_norm": 0.12774705064954062, "kl": 5.5505841203284945e-05, "learning_rate": 4.979050253066063e-07, "loss": 1.6763806343078613e-08, "num_turns": 2.0, "reward": 0.38956862688064575, "reward_std": 0.49937379360198975, "rewards/MLPCodeOnPolicy32BORM/mean": 0.38956862688064575, "rewards/MLPCodeOnPolicy32BORM/std": 0.48975542187690735, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5066.0, "completions/mean_length": 1762.51171875, "completions/min_length": 549.0, "epoch": 0.05442176870748299, "frac_reward_zero_std": 0.0, "grad_norm": 0.124432419027365, "kl": 5.724478234014896e-05, "learning_rate": 4.975419115330267e-07, "loss": 5.21540641784668e-08, "num_turns": 2.0, "reward": 0.4735792279243469, "reward_std": 0.46047526597976685, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4735792279243469, "rewards/MLPCodeOnPolicy32BORM/std": 0.4626067876815796, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4209.0, "completions/mean_length": 1677.42578125, "completions/min_length": 572.0, "epoch": 0.05782312925170068, "frac_reward_zero_std": 0.0, "grad_norm": 0.12903009670717827, "kl": 5.789786484911019e-05, "learning_rate": 4.971499469309197e-07, "loss": 2.2351741790771484e-08, "num_turns": 2.0, "reward": 0.35853224992752075, "reward_std": 0.4827578067779541, "rewards/MLPCodeOnPolicy32BORM/mean": 0.35853227972984314, "rewards/MLPCodeOnPolicy32BORM/std": 0.49547749757766724, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3885.0, "completions/mean_length": 1666.55078125, "completions/min_length": 696.0, "epoch": 0.061224489795918366, "frac_reward_zero_std": 0.0, "grad_norm": 0.12695146741790425, "kl": 6.25709384394213e-05, "learning_rate": 4.967291771834726e-07, "loss": 1.0244548320770264e-07, "num_turns": 2.0, "reward": 0.4618152379989624, "reward_std": 0.47193068265914917, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4618152379989624, "rewards/MLPCodeOnPolicy32BORM/std": 0.4882770776748657, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7093.0, "completions/mean_length": 1748.77734375, "completions/min_length": 520.0, "epoch": 0.06462585034013606, "frac_reward_zero_std": 0.0, "grad_norm": 0.12396913356347632, "kl": 5.553931009671942e-05, "learning_rate": 4.962796513310916e-07, "loss": 6.705522537231445e-08, "num_turns": 2.0, "reward": 0.39511638879776, "reward_std": 0.48685652017593384, "rewards/MLPCodeOnPolicy32BORM/mean": 0.39511638879776, "rewards/MLPCodeOnPolicy32BORM/std": 0.48999226093292236, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5014.0, "completions/mean_length": 1794.609375, "completions/min_length": 680.0, "epoch": 0.06802721088435375, "frac_reward_zero_std": 0.0, "grad_norm": 0.12764640221320528, "kl": 6.153282583909458e-05, "learning_rate": 4.958014217656854e-07, "loss": 7.450580596923828e-09, "num_turns": 2.0, "reward": 0.3481443524360657, "reward_std": 0.49198833107948303, "rewards/MLPCodeOnPolicy32BORM/mean": 0.34814438223838806, "rewards/MLPCodeOnPolicy32BORM/std": 0.48914164304733276, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3801.0, "completions/mean_length": 1760.40625, "completions/min_length": 660.0, "epoch": 0.07142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.12593800576982925, "kl": 6.070601955343591e-05, "learning_rate": 4.952945442245597e-07, "loss": 2.3655593395233154e-07, "num_turns": 2.0, "reward": 0.33063238859176636, "reward_std": 0.48998257517814636, "rewards/MLPCodeOnPolicy32BORM/mean": 0.33063238859176636, "rewards/MLPCodeOnPolicy32BORM/std": 0.501966118812561, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5325.0, "completions/mean_length": 1718.546875, "completions/min_length": 560.0, "epoch": 0.07482993197278912, "frac_reward_zero_std": 0.0, "grad_norm": 0.13000811471416104, "kl": 6.788247685562965e-05, "learning_rate": 4.947590777839208e-07, "loss": 1.2665987014770508e-07, "num_turns": 2.0, "reward": 0.39335066080093384, "reward_std": 0.49354591965675354, "rewards/MLPCodeOnPolicy32BORM/mean": 0.39335066080093384, "rewards/MLPCodeOnPolicy32BORM/std": 0.4913029074668884, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4239.0, "completions/mean_length": 1644.57421875, "completions/min_length": 532.0, "epoch": 0.0782312925170068, "frac_reward_zero_std": 0.0, "grad_norm": 0.1306033618236287, "kl": 6.613362518237409e-05, "learning_rate": 4.941950848519903e-07, "loss": -3.725290298461914e-09, "num_turns": 2.0, "reward": 0.4827514886856079, "reward_std": 0.47881123423576355, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4827515184879303, "rewards/MLPCodeOnPolicy32BORM/std": 0.47407078742980957, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4696.0, "completions/mean_length": 1832.0859375, "completions/min_length": 771.0, "epoch": 0.08163265306122448, "frac_reward_zero_std": 0.0, "grad_norm": 0.12365620608315255, "kl": 6.383584820923716e-05, "learning_rate": 4.936026311617316e-07, "loss": 5.029141902923584e-08, "num_turns": 2.0, "reward": 0.36635422706604004, "reward_std": 0.4653104245662689, "rewards/MLPCodeOnPolicy32BORM/mean": 0.36635422706604004, "rewards/MLPCodeOnPolicy32BORM/std": 0.4677668511867523, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5258.0, "completions/mean_length": 1764.4765625, "completions/min_length": 668.0, "epoch": 0.08503401360544217, "frac_reward_zero_std": 0.0, "grad_norm": 0.12675449588257132, "kl": 7.160034897424339e-05, "learning_rate": 4.92981785763188e-07, "loss": 2.7939677238464355e-08, "num_turns": 2.0, "reward": 0.3244081139564514, "reward_std": 0.4639052748680115, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3244081437587738, "rewards/MLPCodeOnPolicy32BORM/std": 0.4596010744571686, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4426.0, "completions/mean_length": 1652.34375, "completions/min_length": 724.0, "epoch": 0.08843537414965986, "frac_reward_zero_std": 0.0, "grad_norm": 0.13265008990734048, "kl": 6.956292804716213e-05, "learning_rate": 4.923326210154364e-07, "loss": 1.862645149230957e-08, "num_turns": 2.0, "reward": 0.5124205350875854, "reward_std": 0.4297581613063812, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5124205350875854, "rewards/MLPCodeOnPolicy32BORM/std": 0.44342154264450073, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5193.0, "completions/mean_length": 1714.58203125, "completions/min_length": 669.0, "epoch": 0.09183673469387756, "frac_reward_zero_std": 0.0, "grad_norm": 0.13091734914956202, "kl": 6.905263046519394e-05, "learning_rate": 4.916552125781528e-07, "loss": 2.2351741790771484e-08, "num_turns": 2.0, "reward": 0.40827929973602295, "reward_std": 0.5019749402999878, "rewards/MLPCodeOnPolicy32BORM/mean": 0.40827929973602295, "rewards/MLPCodeOnPolicy32BORM/std": 0.49093514680862427, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5548.0, "completions/mean_length": 1680.0234375, "completions/min_length": 727.0, "epoch": 0.09523809523809523, "frac_reward_zero_std": 0.0, "grad_norm": 0.12975055686418424, "kl": 7.241910009270214e-05, "learning_rate": 4.909496394027944e-07, "loss": 1.7508864402770996e-07, "num_turns": 2.0, "reward": 0.464019238948822, "reward_std": 0.46882909536361694, "rewards/MLPCodeOnPolicy32BORM/mean": 0.464019238948822, "rewards/MLPCodeOnPolicy32BORM/std": 0.4726778268814087, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7339.0, "completions/mean_length": 1737.5703125, "completions/min_length": 550.0, "epoch": 0.09863945578231292, "frac_reward_zero_std": 0.0, "grad_norm": 0.12663487691300868, "kl": 7.121374130747427e-05, "learning_rate": 4.902159837233984e-07, "loss": 8.568167686462402e-08, "num_turns": 2.0, "reward": 0.3782956600189209, "reward_std": 0.5221479535102844, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3782956600189209, "rewards/MLPCodeOnPolicy32BORM/std": 0.5246500968933105, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4165.0, "completions/mean_length": 1711.59765625, "completions/min_length": 574.0, "epoch": 0.10204081632653061, "frac_reward_zero_std": 0.0, "grad_norm": 0.13567792821976454, "kl": 7.378515601885738e-05, "learning_rate": 4.894543310469967e-07, "loss": 1.1734664440155029e-07, "num_turns": 2.0, "reward": 0.4272693991661072, "reward_std": 0.5026066899299622, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4272693693637848, "rewards/MLPCodeOnPolicy32BORM/std": 0.5159730911254883, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6039.0, "completions/mean_length": 1723.34765625, "completions/min_length": 658.0, "epoch": 0.1054421768707483, "frac_reward_zero_std": 0.0, "grad_norm": 0.1220857505904424, "kl": 7.243182494676148e-05, "learning_rate": 4.886647701436513e-07, "loss": 7.636845111846924e-08, "num_turns": 2.0, "reward": 0.43645182251930237, "reward_std": 0.4528648257255554, "rewards/MLPCodeOnPolicy32BORM/mean": 0.43645179271698, "rewards/MLPCodeOnPolicy32BORM/std": 0.4559268057346344, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4791.0, "completions/mean_length": 1782.40625, "completions/min_length": 754.0, "epoch": 0.10884353741496598, "frac_reward_zero_std": 0.0, "grad_norm": 0.12742842792578463, "kl": 7.836230156499369e-05, "learning_rate": 4.878473930361071e-07, "loss": 5.494803190231323e-08, "num_turns": 2.0, "reward": 0.4406846761703491, "reward_std": 0.47518694400787354, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4406846761703491, "rewards/MLPCodeOnPolicy32BORM/std": 0.489067018032074, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7646.0, "completions/mean_length": 1724.53125, "completions/min_length": 596.0, "epoch": 0.11224489795918367, "frac_reward_zero_std": 0.0, "grad_norm": 0.12769787449282893, "kl": 7.334854217333486e-05, "learning_rate": 4.870022949890676e-07, "loss": 6.05359673500061e-08, "num_turns": 2.0, "reward": 0.38206565380096436, "reward_std": 0.4586794376373291, "rewards/MLPCodeOnPolicy32BORM/mean": 0.38206565380096436, "rewards/MLPCodeOnPolicy32BORM/std": 0.46659746766090393, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4114.0, "completions/mean_length": 1700.08203125, "completions/min_length": 646.0, "epoch": 0.11564625850340136, "frac_reward_zero_std": 0.0, "grad_norm": 0.1292062463672102, "kl": 7.518993618305103e-05, "learning_rate": 4.861295744980913e-07, "loss": 8.940696716308594e-08, "num_turns": 2.0, "reward": 0.4325985312461853, "reward_std": 0.40971022844314575, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4325985312461853, "rewards/MLPCodeOnPolicy32BORM/std": 0.4278082549571991, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4858.0, "completions/mean_length": 1868.4921875, "completions/min_length": 547.0, "epoch": 0.11904761904761904, "frac_reward_zero_std": 0.0, "grad_norm": 0.12345204127675038, "kl": 7.450408247677842e-05, "learning_rate": 4.852293332781124e-07, "loss": 5.727633833885193e-08, "num_turns": 2.0, "reward": 0.35513925552368164, "reward_std": 0.44654712080955505, "rewards/MLPCodeOnPolicy32BORM/mean": 0.35513922572135925, "rewards/MLPCodeOnPolicy32BORM/std": 0.45481741428375244, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1713.609375, "completions/min_length": 561.0, "epoch": 0.12244897959183673, "frac_reward_zero_std": 0.0, "grad_norm": 0.12510015631970176, "kl": 7.508149332124958e-05, "learning_rate": 4.843016762515859e-07, "loss": 8.940696716308594e-08, "num_turns": 2.0, "reward": 0.40725451707839966, "reward_std": 0.4564969539642334, "rewards/MLPCodeOnPolicy32BORM/mean": 0.40725451707839966, "rewards/MLPCodeOnPolicy32BORM/std": 0.46071183681488037, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3718.0, "completions/mean_length": 1622.859375, "completions/min_length": 480.0, "epoch": 0.12585034013605442, "frac_reward_zero_std": 0.0, "grad_norm": 0.1286319398978756, "kl": 7.390474104340683e-05, "learning_rate": 4.833467115362589e-07, "loss": 1.1548399925231934e-07, "num_turns": 2.0, "reward": 0.47519996762275696, "reward_std": 0.3644979000091553, "rewards/MLPCodeOnPolicy32BORM/mean": 0.47519993782043457, "rewards/MLPCodeOnPolicy32BORM/std": 0.380415678024292, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4462.0, "completions/mean_length": 1670.49609375, "completions/min_length": 571.0, "epoch": 0.1292517006802721, "frac_reward_zero_std": 0.0, "grad_norm": 0.13038416386547164, "kl": 7.876049130572937e-05, "learning_rate": 4.823645504325699e-07, "loss": 7.450580596923828e-08, "num_turns": 2.0, "reward": 0.3584836721420288, "reward_std": 0.4540232717990875, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3584836721420288, "rewards/MLPCodeOnPolicy32BORM/std": 0.44441646337509155, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5251.0, "completions/mean_length": 1839.046875, "completions/min_length": 490.0, "epoch": 0.1326530612244898, "frac_reward_zero_std": 0.0, "grad_norm": 0.12463579695824678, "kl": 7.707584961735847e-05, "learning_rate": 4.81355307410676e-07, "loss": 7.82310962677002e-08, "num_turns": 2.0, "reward": 0.41247132420539856, "reward_std": 0.4671492278575897, "rewards/MLPCodeOnPolicy32BORM/mean": 0.41247132420539856, "rewards/MLPCodeOnPolicy32BORM/std": 0.4722498953342438, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7413.0, "completions/mean_length": 1708.32421875, "completions/min_length": 477.0, "epoch": 0.1360544217687075, "frac_reward_zero_std": 0.0, "grad_norm": 0.13875688765149183, "kl": 8.656998204514821e-05, "learning_rate": 4.803191000971128e-07, "loss": -3.725290298461914e-09, "num_turns": 2.0, "reward": 0.36401766538619995, "reward_std": 0.4898770749568939, "rewards/MLPCodeOnPolicy32BORM/mean": 0.36401766538619995, "rewards/MLPCodeOnPolicy32BORM/std": 0.48293355107307434, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5456.0, "completions/mean_length": 1664.0078125, "completions/min_length": 694.0, "epoch": 0.13945578231292516, "frac_reward_zero_std": 0.0, "grad_norm": 0.12606923287028104, "kl": 8.074963511717215e-05, "learning_rate": 4.792560492610835e-07, "loss": 7.450580596923828e-08, "num_turns": 2.0, "reward": 0.4370952248573303, "reward_std": 0.3985515832901001, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4370952248573303, "rewards/MLPCodeOnPolicy32BORM/std": 0.39930838346481323, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7162.0, "completions/mean_length": 1715.57421875, "completions/min_length": 603.0, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.1349781738713447, "kl": 8.651554469452094e-05, "learning_rate": 4.78166278800385e-07, "loss": 1.043081283569336e-07, "num_turns": 2.0, "reward": 0.36490052938461304, "reward_std": 0.45700886845588684, "rewards/MLPCodeOnPolicy32BORM/mean": 0.36490052938461304, "rewards/MLPCodeOnPolicy32BORM/std": 0.4708576202392578, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1832.96484375, "completions/min_length": 748.0, "epoch": 0.14625850340136054, "frac_reward_zero_std": 0.0, "grad_norm": 0.12229902259500831, "kl": 7.565521184460522e-05, "learning_rate": 4.770499157269663e-07, "loss": -3.4458935260772705e-08, "num_turns": 2.0, "reward": 0.47392046451568604, "reward_std": 0.4232255220413208, "rewards/MLPCodeOnPolicy32BORM/mean": 0.47392046451568604, "rewards/MLPCodeOnPolicy32BORM/std": 0.4284113347530365, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6068.0, "completions/mean_length": 1806.5859375, "completions/min_length": 665.0, "epoch": 0.14965986394557823, "frac_reward_zero_std": 0.0, "grad_norm": 0.12303543586950182, "kl": 7.862668167035736e-05, "learning_rate": 4.7590709015212633e-07, "loss": 6.146728992462158e-08, "num_turns": 2.0, "reward": 0.14448022842407227, "reward_std": 0.5132663249969482, "rewards/MLPCodeOnPolicy32BORM/mean": 0.14448022842407227, "rewards/MLPCodeOnPolicy32BORM/std": 0.5187545418739319, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1684.65234375, "completions/min_length": 656.0, "epoch": 0.15306122448979592, "frac_reward_zero_std": 0.0, "grad_norm": 0.12705093094712636, "kl": 8.699895329300489e-05, "learning_rate": 4.747379352713488e-07, "loss": 8.940696716308594e-08, "num_turns": 2.0, "reward": 0.43057385087013245, "reward_std": 0.42482852935791016, "rewards/MLPCodeOnPolicy32BORM/mean": 0.43057382106781006, "rewards/MLPCodeOnPolicy32BORM/std": 0.4353935718536377, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5181.0, "completions/mean_length": 1747.23046875, "completions/min_length": 646.0, "epoch": 0.1564625850340136, "frac_reward_zero_std": 0.0, "grad_norm": 0.13013278675105897, "kl": 8.532286244644638e-05, "learning_rate": 4.7354258734877906e-07, "loss": 7.82310962677002e-08, "num_turns": 2.0, "reward": 0.21112340688705444, "reward_std": 0.49027031660079956, "rewards/MLPCodeOnPolicy32BORM/mean": 0.21112340688705444, "rewards/MLPCodeOnPolicy32BORM/std": 0.5018395781517029, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5155.0, "completions/mean_length": 1782.39453125, "completions/min_length": 626.0, "epoch": 0.1598639455782313, "frac_reward_zero_std": 0.0, "grad_norm": 0.12503700814380458, "kl": 8.357773197076312e-05, "learning_rate": 4.7232118570134227e-07, "loss": 2.9802322387695312e-08, "num_turns": 2.0, "reward": 0.46107158064842224, "reward_std": 0.4947185516357422, "rewards/MLPCodeOnPolicy32BORM/mean": 0.46107155084609985, "rewards/MLPCodeOnPolicy32BORM/std": 0.4803822934627533, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3971.0, "completions/mean_length": 1709.5, "completions/min_length": 567.0, "epoch": 0.16326530612244897, "frac_reward_zero_std": 0.0, "grad_norm": 0.12837085947616933, "kl": 8.580618555242836e-05, "learning_rate": 4.7107387268250586e-07, "loss": 6.705522537231445e-08, "num_turns": 2.0, "reward": 0.39588984847068787, "reward_std": 0.4714891314506531, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3958898186683655, "rewards/MLPCodeOnPolicy32BORM/std": 0.46815717220306396, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5086.0, "completions/mean_length": 1820.640625, "completions/min_length": 791.0, "epoch": 0.16666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.12510105746824587, "kl": 8.930987769417698e-05, "learning_rate": 4.69800793665689e-07, "loss": 1.1548399925231934e-07, "num_turns": 2.0, "reward": 0.28018033504486084, "reward_std": 0.4455254077911377, "rewards/MLPCodeOnPolicy32BORM/mean": 0.28018033504486084, "rewards/MLPCodeOnPolicy32BORM/std": 0.46793651580810547, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1757.30078125, "completions/min_length": 697.0, "epoch": 0.17006802721088435, "frac_reward_zero_std": 0.0, "grad_norm": 0.1205305455949019, "kl": 9.023273048569536e-05, "learning_rate": 4.685020970273189e-07, "loss": 1.0803341865539551e-07, "num_turns": 2.0, "reward": 0.34440556168556213, "reward_std": 0.42845094203948975, "rewards/MLPCodeOnPolicy32BORM/mean": 0.34440553188323975, "rewards/MLPCodeOnPolicy32BORM/std": 0.42874884605407715, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4636.0, "completions/mean_length": 1805.30859375, "completions/min_length": 759.0, "epoch": 0.17346938775510204, "frac_reward_zero_std": 0.0, "grad_norm": 0.12358348328235372, "kl": 9.223375809597201e-05, "learning_rate": 4.6717793412953776e-07, "loss": 1.4528632164001465e-07, "num_turns": 2.0, "reward": 0.35937339067459106, "reward_std": 0.45837679505348206, "rewards/MLPCodeOnPolicy32BORM/mean": 0.35937339067459106, "rewards/MLPCodeOnPolicy32BORM/std": 0.45718511939048767, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4918.0, "completions/mean_length": 1776.66015625, "completions/min_length": 468.0, "epoch": 0.17687074829931973, "frac_reward_zero_std": 0.0, "grad_norm": 0.12023113902964513, "kl": 9.91547349258326e-05, "learning_rate": 4.6582845930256166e-07, "loss": 6.146728992462158e-08, "num_turns": 2.0, "reward": 0.4752587080001831, "reward_std": 0.46407151222229004, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4752587080001831, "rewards/MLPCodeOnPolicy32BORM/std": 0.4792989492416382, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4763.0, "completions/mean_length": 1784.984375, "completions/min_length": 632.0, "epoch": 0.18027210884353742, "frac_reward_zero_std": 0.0, "grad_norm": 0.12500207873670774, "kl": 0.00010303906310582533, "learning_rate": 4.6445382982669354e-07, "loss": 8.940696716308594e-08, "num_turns": 2.0, "reward": 0.31531599164009094, "reward_std": 0.46134036779403687, "rewards/MLPCodeOnPolicy32BORM/mean": 0.31531602144241333, "rewards/MLPCodeOnPolicy32BORM/std": 0.47073715925216675, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7204.0, "completions/mean_length": 1856.4453125, "completions/min_length": 538.0, "epoch": 0.1836734693877551, "frac_reward_zero_std": 0.0, "grad_norm": 0.12421921518476847, "kl": 9.920350612446782e-05, "learning_rate": 4.630542059139923e-07, "loss": 6.705522537231445e-08, "num_turns": 2.0, "reward": 0.33992254734039307, "reward_std": 0.5055447220802307, "rewards/MLPCodeOnPolicy32BORM/mean": 0.33992254734039307, "rewards/MLPCodeOnPolicy32BORM/std": 0.4955473840236664, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1686.87890625, "completions/min_length": 479.0, "epoch": 0.1870748299319728, "frac_reward_zero_std": 0.0, "grad_norm": 0.1254120719869612, "kl": 0.00011915153527297662, "learning_rate": 4.616297506896001e-07, "loss": 1.4901161193847656e-07, "num_turns": 2.0, "reward": 0.3778541684150696, "reward_std": 0.47446173429489136, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3778541386127472, "rewards/MLPCodeOnPolicy32BORM/std": 0.48409169912338257, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4574.0, "completions/mean_length": 1665.82421875, "completions/min_length": 562.0, "epoch": 0.19047619047619047, "frac_reward_zero_std": 0.0, "grad_norm": 0.12973197352263433, "kl": 0.00011406978615013941, "learning_rate": 4.601806301727302e-07, "loss": 1.043081283569336e-07, "num_turns": 2.0, "reward": 0.44780969619750977, "reward_std": 0.4106203019618988, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4478096663951874, "rewards/MLPCodeOnPolicy32BORM/std": 0.43979954719543457, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6027.0, "completions/mean_length": 1682.44921875, "completions/min_length": 520.0, "epoch": 0.19387755102040816, "frac_reward_zero_std": 0.0, "grad_norm": 0.13270967450989135, "kl": 0.00012001413165307895, "learning_rate": 4.5870701325731773e-07, "loss": 6.705522537231445e-08, "num_turns": 2.0, "reward": 0.49996864795684814, "reward_std": 0.4304153323173523, "rewards/MLPCodeOnPolicy32BORM/mean": 0.49996861815452576, "rewards/MLPCodeOnPolicy32BORM/std": 0.44452470541000366, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1712.70703125, "completions/min_length": 597.0, "epoch": 0.19727891156462585, "frac_reward_zero_std": 0.0, "grad_norm": 0.12258262023187534, "kl": 0.00011655640764729469, "learning_rate": 4.572090716923353e-07, "loss": 1.1175870895385742e-07, "num_turns": 2.0, "reward": 0.4167475700378418, "reward_std": 0.42564165592193604, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4167475700378418, "rewards/MLPCodeOnPolicy32BORM/std": 0.45184454321861267, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4757.0, "completions/mean_length": 1797.17578125, "completions/min_length": 614.0, "epoch": 0.20068027210884354, "frac_reward_zero_std": 0.0, "grad_norm": 0.12136217537860085, "kl": 0.00010487847112017334, "learning_rate": 4.556869800617753e-07, "loss": 6.705522537231445e-08, "num_turns": 2.0, "reward": 0.4988711476325989, "reward_std": 0.4274623990058899, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4988711476325989, "rewards/MLPCodeOnPolicy32BORM/std": 0.43597930669784546, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6339.0, "completions/mean_length": 1699.640625, "completions/min_length": 574.0, "epoch": 0.20408163265306123, "frac_reward_zero_std": 0.0, "grad_norm": 0.12800442809508575, "kl": 0.00012965655946572952, "learning_rate": 4.541409157643027e-07, "loss": 1.210719347000122e-07, "num_turns": 2.0, "reward": 0.47402942180633545, "reward_std": 0.48247969150543213, "rewards/MLPCodeOnPolicy32BORM/mean": 0.47402939200401306, "rewards/MLPCodeOnPolicy32BORM/std": 0.4953751862049103, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6665.0, "completions/mean_length": 1608.78515625, "completions/min_length": 572.0, "epoch": 0.20748299319727892, "frac_reward_zero_std": 0.0, "grad_norm": 0.13210996972657596, "kl": 0.00013131504510965897, "learning_rate": 4.5257105899257937e-07, "loss": 2.0302832126617432e-07, "num_turns": 2.0, "reward": 0.47677308320999146, "reward_std": 0.464009165763855, "rewards/MLPCodeOnPolicy32BORM/mean": 0.47677308320999146, "rewards/MLPCodeOnPolicy32BORM/std": 0.4753335416316986, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4159.0, "completions/mean_length": 1727.4140625, "completions/min_length": 599.0, "epoch": 0.2108843537414966, "frac_reward_zero_std": 0.0, "grad_norm": 0.1276960509567404, "kl": 0.000136644269559838, "learning_rate": 4.5097759271226247e-07, "loss": 1.1734664440155029e-07, "num_turns": 2.0, "reward": 0.4505724310874939, "reward_std": 0.4149358868598938, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4505724310874939, "rewards/MLPCodeOnPolicy32BORM/std": 0.4140785038471222, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5320.0, "completions/mean_length": 1669.0546875, "completions/min_length": 527.0, "epoch": 0.21428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.13006286691884608, "kl": 0.0001404530455602071, "learning_rate": 4.4936070264068016e-07, "loss": 1.3131648302078247e-07, "num_turns": 2.0, "reward": 0.4712015986442566, "reward_std": 0.43090948462486267, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4712015986442566, "rewards/MLPCodeOnPolicy32BORM/std": 0.43964844942092896, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6049.0, "completions/mean_length": 1694.3203125, "completions/min_length": 589.0, "epoch": 0.21768707482993196, "frac_reward_zero_std": 0.0, "grad_norm": 0.1233139379942691, "kl": 0.0001400133287461358, "learning_rate": 4.477205772251864e-07, "loss": 1.862645149230957e-07, "num_turns": 2.0, "reward": 0.45948171615600586, "reward_std": 0.385934054851532, "rewards/MLPCodeOnPolicy32BORM/mean": 0.45948168635368347, "rewards/MLPCodeOnPolicy32BORM/std": 0.4270806610584259, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3535.0, "completions/mean_length": 1640.0234375, "completions/min_length": 534.0, "epoch": 0.22108843537414966, "frac_reward_zero_std": 0.0, "grad_norm": 0.13305210668094364, "kl": 0.0001618890607915091, "learning_rate": 4.4605740762119726e-07, "loss": 2.4028122425079346e-07, "num_turns": 2.0, "reward": 0.5881233811378479, "reward_std": 0.3828655481338501, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5881233811378479, "rewards/MLPCodeOnPolicy32BORM/std": 0.38702020049095154, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6117.0, "completions/mean_length": 1656.7265625, "completions/min_length": 551.0, "epoch": 0.22448979591836735, "frac_reward_zero_std": 0.0, "grad_norm": 0.12517742318338876, "kl": 0.00014426143570744898, "learning_rate": 4.443713876699123e-07, "loss": 8.754432201385498e-08, "num_turns": 2.0, "reward": 0.4771096706390381, "reward_std": 0.4157760739326477, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4771096706390381, "rewards/MLPCodeOnPolicy32BORM/std": 0.45321065187454224, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7229.0, "completions/mean_length": 1641.0703125, "completions/min_length": 445.0, "epoch": 0.22789115646258504, "frac_reward_zero_std": 0.0, "grad_norm": 0.13614948043808453, "kl": 0.00016474105905217584, "learning_rate": 4.426627138757223e-07, "loss": 1.8812716007232666e-07, "num_turns": 2.0, "reward": 0.5012058019638062, "reward_std": 0.4257659912109375, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5012058019638062, "rewards/MLPCodeOnPolicy32BORM/std": 0.435712069272995, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5974.0, "completions/mean_length": 1757.94921875, "completions/min_length": 642.0, "epoch": 0.23129251700680273, "frac_reward_zero_std": 0.0, "grad_norm": 0.13400741095314625, "kl": 0.00016489506492689543, "learning_rate": 4.409315853833067e-07, "loss": 1.564621925354004e-07, "num_turns": 2.0, "reward": 0.3585292398929596, "reward_std": 0.4674282670021057, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3585292100906372, "rewards/MLPCodeOnPolicy32BORM/std": 0.4627254009246826, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1691.52734375, "completions/min_length": 555.0, "epoch": 0.23469387755102042, "frac_reward_zero_std": 0.0, "grad_norm": 0.13189680063936968, "kl": 0.00018260819911120052, "learning_rate": 4.391782039544238e-07, "loss": 2.1979212760925293e-07, "num_turns": 2.0, "reward": 0.38688501715660095, "reward_std": 0.44273632764816284, "rewards/MLPCodeOnPolicy32BORM/mean": 0.38688501715660095, "rewards/MLPCodeOnPolicy32BORM/std": 0.44483256340026855, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5656.0, "completions/mean_length": 1771.47265625, "completions/min_length": 577.0, "epoch": 0.23809523809523808, "frac_reward_zero_std": 0.0, "grad_norm": 0.11942098230903217, "kl": 0.00015966588580340613, "learning_rate": 4.374027739443952e-07, "loss": 1.9744038581848145e-07, "num_turns": 2.0, "reward": 0.4191434979438782, "reward_std": 0.39943796396255493, "rewards/MLPCodeOnPolicy32BORM/mean": 0.41914352774620056, "rewards/MLPCodeOnPolicy32BORM/std": 0.4300435185432434, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5968.0, "completions/mean_length": 1812.03125, "completions/min_length": 576.0, "epoch": 0.24149659863945577, "frac_reward_zero_std": 0.0, "grad_norm": 0.12415689131892878, "kl": 0.00016537675492145354, "learning_rate": 4.3560550227828834e-07, "loss": 1.7508864402770996e-07, "num_turns": 2.0, "reward": 0.5332323312759399, "reward_std": 0.3747295141220093, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5332322716712952, "rewards/MLPCodeOnPolicy32BORM/std": 0.3812965452671051, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5582.0, "completions/mean_length": 1716.89453125, "completions/min_length": 589.0, "epoch": 0.24489795918367346, "frac_reward_zero_std": 0.0, "grad_norm": 0.1249995771267063, "kl": 0.0001809181378575886, "learning_rate": 4.337865984268001e-07, "loss": 1.9371509552001953e-07, "num_turns": 2.0, "reward": 0.43725067377090454, "reward_std": 0.42010611295700073, "rewards/MLPCodeOnPolicy32BORM/mean": 0.43725070357322693, "rewards/MLPCodeOnPolicy32BORM/std": 0.43573060631752014, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7308.0, "completions/mean_length": 1728.125, "completions/min_length": 579.0, "epoch": 0.24829931972789115, "frac_reward_zero_std": 0.0, "grad_norm": 0.12553968210728977, "kl": 0.00020021227760480542, "learning_rate": 4.3194627438184233e-07, "loss": 1.7508864402770996e-07, "num_turns": 2.0, "reward": 0.4539065957069397, "reward_std": 0.46171799302101135, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4539065957069397, "rewards/MLPCodeOnPolicy32BORM/std": 0.457562118768692, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5294.0, "completions/mean_length": 1772.98828125, "completions/min_length": 695.0, "epoch": 0.25170068027210885, "frac_reward_zero_std": 0.0, "grad_norm": 0.11900417608479627, "kl": 0.00018830790418178367, "learning_rate": 4.3008474463183496e-07, "loss": 2.086162567138672e-07, "num_turns": 2.0, "reward": 0.4869869351387024, "reward_std": 0.4232376515865326, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4869869649410248, "rewards/MLPCodeOnPolicy32BORM/std": 0.4423135221004486, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1740.9140625, "completions/min_length": 516.0, "epoch": 0.25510204081632654, "frac_reward_zero_std": 0.0, "grad_norm": 0.1256588784763905, "kl": 0.00020819194855903334, "learning_rate": 4.282022261367073e-07, "loss": 2.2351741790771484e-07, "num_turns": 2.0, "reward": 0.48140135407447815, "reward_std": 0.40134385228157043, "rewards/MLPCodeOnPolicy32BORM/mean": 0.48140132427215576, "rewards/MLPCodeOnPolicy32BORM/std": 0.44396087527275085, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5386.0, "completions/mean_length": 1720.82421875, "completions/min_length": 624.0, "epoch": 0.2585034013605442, "frac_reward_zero_std": 0.0, "grad_norm": 0.1261532919992208, "kl": 0.00020027517689413799, "learning_rate": 4.262989383026114e-07, "loss": 2.4400651454925537e-07, "num_turns": 2.0, "reward": 0.33791497349739075, "reward_std": 0.47338148951530457, "rewards/MLPCodeOnPolicy32BORM/mean": 0.33791500329971313, "rewards/MLPCodeOnPolicy32BORM/std": 0.4666896462440491, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5895.0, "completions/mean_length": 1673.328125, "completions/min_length": 458.0, "epoch": 0.2619047619047619, "frac_reward_zero_std": 0.0, "grad_norm": 0.1281227994019189, "kl": 0.0002022001203840773, "learning_rate": 4.243751029563507e-07, "loss": 1.6391277313232422e-07, "num_turns": 2.0, "reward": 0.49508824944496155, "reward_std": 0.42326685786247253, "rewards/MLPCodeOnPolicy32BORM/mean": 0.49508827924728394, "rewards/MLPCodeOnPolicy32BORM/std": 0.44494879245758057, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1793.4453125, "completions/min_length": 737.0, "epoch": 0.2653061224489796, "frac_reward_zero_std": 0.0, "grad_norm": 0.12400128210142895, "kl": 0.00018887607643591764, "learning_rate": 4.2243094431952607e-07, "loss": 1.993030309677124e-07, "num_turns": 2.0, "reward": 0.3935734033584595, "reward_std": 0.4245133399963379, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3935733735561371, "rewards/MLPCodeOnPolicy32BORM/std": 0.4357367157936096, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5328.0, "completions/mean_length": 1590.703125, "completions/min_length": 621.0, "epoch": 0.2687074829931973, "frac_reward_zero_std": 0.0, "grad_norm": 0.1276264683063012, "kl": 0.0002454274231240561, "learning_rate": 4.2046668898240296e-07, "loss": 2.1792948246002197e-07, "num_turns": 2.0, "reward": 0.4903767704963684, "reward_std": 0.41225385665893555, "rewards/MLPCodeOnPolicy32BORM/mean": 0.49037671089172363, "rewards/MLPCodeOnPolicy32BORM/std": 0.4092792570590973, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5005.0, "completions/mean_length": 1648.64453125, "completions/min_length": 449.0, "epoch": 0.272108843537415, "frac_reward_zero_std": 0.0, "grad_norm": 0.13886289723410575, "kl": 0.0002334505954877386, "learning_rate": 4.184825658775027e-07, "loss": 1.993030309677124e-07, "num_turns": 2.0, "reward": 0.4053865075111389, "reward_std": 0.40530964732170105, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4053865373134613, "rewards/MLPCodeOnPolicy32BORM/std": 0.4158993363380432, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4601.0, "completions/mean_length": 1696.8515625, "completions/min_length": 533.0, "epoch": 0.2755102040816326, "frac_reward_zero_std": 0.0, "grad_norm": 0.12564175760915286, "kl": 0.00022438414066527912, "learning_rate": 4.1647880625292027e-07, "loss": 2.3562461137771606e-07, "num_turns": 2.0, "reward": 0.4553804397583008, "reward_std": 0.37617209553718567, "rewards/MLPCodeOnPolicy32BORM/mean": 0.45538046956062317, "rewards/MLPCodeOnPolicy32BORM/std": 0.3870598375797272, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5275.0, "completions/mean_length": 1695.30078125, "completions/min_length": 468.0, "epoch": 0.2789115646258503, "frac_reward_zero_std": 0.0, "grad_norm": 0.12220072275161292, "kl": 0.00022701559555571293, "learning_rate": 4.1445564364537266e-07, "loss": 1.9371509552001953e-07, "num_turns": 2.0, "reward": 0.5103209614753723, "reward_std": 0.3658777177333832, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5103209018707275, "rewards/MLPCodeOnPolicy32BORM/std": 0.38078218698501587, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6187.0, "completions/mean_length": 1759.6953125, "completions/min_length": 416.0, "epoch": 0.282312925170068, "frac_reward_zero_std": 0.0, "grad_norm": 0.1282708212338203, "kl": 0.00020942796845702105, "learning_rate": 4.124133138529803e-07, "loss": 2.1141022443771362e-07, "num_turns": 2.0, "reward": 0.3109988272190094, "reward_std": 0.452178031206131, "rewards/MLPCodeOnPolicy32BORM/mean": 0.310998797416687, "rewards/MLPCodeOnPolicy32BORM/std": 0.4474020004272461, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1670.4296875, "completions/min_length": 514.0, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.1303929574716558, "kl": 0.0002468315587975667, "learning_rate": 4.1035205490778496e-07, "loss": 3.0919909477233887e-07, "num_turns": 2.0, "reward": 0.5012885332107544, "reward_std": 0.499253511428833, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5012885332107544, "rewards/MLPCodeOnPolicy32BORM/std": 0.4859963655471802, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6447.0, "completions/mean_length": 1774.4921875, "completions/min_length": 523.0, "epoch": 0.2891156462585034, "frac_reward_zero_std": 0.0, "grad_norm": 0.1321627252917682, "kl": 0.00023074077262208448, "learning_rate": 4.0827210704800745e-07, "loss": 1.043081283569336e-07, "num_turns": 2.0, "reward": 0.4319390058517456, "reward_std": 0.48060840368270874, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4319390058517456, "rewards/MLPCodeOnPolicy32BORM/std": 0.49155551195144653, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4836.0, "completions/mean_length": 1665.26171875, "completions/min_length": 549.0, "epoch": 0.2925170068027211, "frac_reward_zero_std": 0.0, "grad_norm": 0.12803690948396568, "kl": 0.00025092134728765814, "learning_rate": 4.061737126900478e-07, "loss": 3.8929283618927e-07, "num_turns": 2.0, "reward": 0.5961266160011292, "reward_std": 0.36096709966659546, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5961266160011292, "rewards/MLPCodeOnPolicy32BORM/std": 0.39821961522102356, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6643.0, "completions/mean_length": 1680.0703125, "completions/min_length": 548.0, "epoch": 0.29591836734693877, "frac_reward_zero_std": 0.0, "grad_norm": 0.1290457837944881, "kl": 0.00026406801862322027, "learning_rate": 4.040571164002318e-07, "loss": 2.7194619178771973e-07, "num_turns": 2.0, "reward": 0.3678485155105591, "reward_std": 0.4080837368965149, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3678485155105591, "rewards/MLPCodeOnPolicy32BORM/std": 0.4086749851703644, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4514.0, "completions/mean_length": 1608.74609375, "completions/min_length": 561.0, "epoch": 0.29931972789115646, "frac_reward_zero_std": 0.0, "grad_norm": 0.12673992847722323, "kl": 0.00025539301213939325, "learning_rate": 4.019225648663072e-07, "loss": 2.551823854446411e-07, "num_turns": 2.0, "reward": 0.5111918449401855, "reward_std": 0.3585626482963562, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5111918449401855, "rewards/MLPCodeOnPolicy32BORM/std": 0.3727354109287262, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6841.0, "completions/mean_length": 1656.125, "completions/min_length": 541.0, "epoch": 0.30272108843537415, "frac_reward_zero_std": 0.0, "grad_norm": 0.1275437551932911, "kl": 0.00023101651822798885, "learning_rate": 3.997703068686923e-07, "loss": 2.123415470123291e-07, "num_turns": 2.0, "reward": 0.4530450105667114, "reward_std": 0.40583527088165283, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4530450105667114, "rewards/MLPCodeOnPolicy32BORM/std": 0.4039011299610138, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5366.0, "completions/mean_length": 1626.859375, "completions/min_length": 506.0, "epoch": 0.30612244897959184, "frac_reward_zero_std": 0.0, "grad_norm": 0.12855829116477677, "kl": 0.00025528713968014927, "learning_rate": 3.9760059325148063e-07, "loss": 1.30385160446167e-07, "num_turns": 2.0, "reward": 0.4176099896430969, "reward_std": 0.43359875679016113, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4176099896430969, "rewards/MLPCodeOnPolicy32BORM/std": 0.44678640365600586, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5738.0, "completions/mean_length": 1691.70703125, "completions/min_length": 483.0, "epoch": 0.30952380952380953, "frac_reward_zero_std": 0.0, "grad_norm": 0.12543869877212768, "kl": 0.00025521306588416337, "learning_rate": 3.954136768932056e-07, "loss": 2.8032809495925903e-07, "num_turns": 2.0, "reward": 0.554349422454834, "reward_std": 0.38532671332359314, "rewards/MLPCodeOnPolicy32BORM/mean": 0.554349422454834, "rewards/MLPCodeOnPolicy32BORM/std": 0.4123691916465759, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6134.0, "completions/mean_length": 1643.35546875, "completions/min_length": 573.0, "epoch": 0.3129251700680272, "frac_reward_zero_std": 0.0, "grad_norm": 0.13128657513735853, "kl": 0.00027815132762043504, "learning_rate": 3.932098126773674e-07, "loss": 2.3096799850463867e-07, "num_turns": 2.0, "reward": 0.4468424320220947, "reward_std": 0.3695867657661438, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4468424320220947, "rewards/MLPCodeOnPolicy32BORM/std": 0.37673676013946533, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5820.0, "completions/mean_length": 1658.8046875, "completions/min_length": 500.0, "epoch": 0.3163265306122449, "frac_reward_zero_std": 0.0, "grad_norm": 0.1232640314353311, "kl": 0.0002867808389055426, "learning_rate": 3.909892574627266e-07, "loss": 3.203749656677246e-07, "num_turns": 2.0, "reward": 0.431984543800354, "reward_std": 0.4377307593822479, "rewards/MLPCodeOnPolicy32BORM/mean": 0.431984543800354, "rewards/MLPCodeOnPolicy32BORM/std": 0.4598510265350342, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5530.0, "completions/mean_length": 1767.25390625, "completions/min_length": 391.0, "epoch": 0.3197278911564626, "frac_reward_zero_std": 0.0, "grad_norm": 0.13394884598636536, "kl": 0.00028032092995999847, "learning_rate": 3.887522700533675e-07, "loss": 2.4028122425079346e-07, "num_turns": 2.0, "reward": 0.45087701082229614, "reward_std": 0.4690535068511963, "rewards/MLPCodeOnPolicy32BORM/mean": 0.45087701082229614, "rewards/MLPCodeOnPolicy32BORM/std": 0.491955041885376, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6639.0, "completions/mean_length": 1665.76171875, "completions/min_length": 687.0, "epoch": 0.3231292517006803, "frac_reward_zero_std": 0.0, "grad_norm": 0.12812015596809412, "kl": 0.0002728628523982479, "learning_rate": 3.864991111685345e-07, "loss": 2.7939677238464355e-07, "num_turns": 2.0, "reward": 0.48972389101982117, "reward_std": 0.4347324073314667, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4897238612174988, "rewards/MLPCodeOnPolicy32BORM/std": 0.43040478229522705, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1657.66015625, "completions/min_length": 563.0, "epoch": 0.32653061224489793, "frac_reward_zero_std": 0.0, "grad_norm": 0.12430643043240865, "kl": 0.00028989990596528514, "learning_rate": 3.8423004341224595e-07, "loss": 2.7567148208618164e-07, "num_turns": 2.0, "reward": 0.33566781878471375, "reward_std": 0.5082817077636719, "rewards/MLPCodeOnPolicy32BORM/mean": 0.33566781878471375, "rewards/MLPCodeOnPolicy32BORM/std": 0.49739500880241394, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4475.0, "completions/mean_length": 1591.48046875, "completions/min_length": 546.0, "epoch": 0.3299319727891156, "frac_reward_zero_std": 0.0, "grad_norm": 0.1262900325387896, "kl": 0.00032333704984921496, "learning_rate": 3.819453312426871e-07, "loss": 2.644956111907959e-07, "num_turns": 2.0, "reward": 0.3694228529930115, "reward_std": 0.4831184148788452, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3694228529930115, "rewards/MLPCodeOnPolicy32BORM/std": 0.500664472579956, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3747.0, "completions/mean_length": 1610.03515625, "completions/min_length": 546.0, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 0.1258432953697267, "kl": 0.0002983086321819428, "learning_rate": 3.796452409413887e-07, "loss": 3.0547380447387695e-07, "num_turns": 2.0, "reward": 0.5325152277946472, "reward_std": 0.4058530330657959, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5325151681900024, "rewards/MLPCodeOnPolicy32BORM/std": 0.41554924845695496, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4152.0, "completions/mean_length": 1535.42578125, "completions/min_length": 470.0, "epoch": 0.336734693877551, "frac_reward_zero_std": 0.0, "grad_norm": 0.13426108423653652, "kl": 0.0003142357822980557, "learning_rate": 3.773300405821908e-07, "loss": 4.10713255405426e-07, "num_turns": 2.0, "reward": 0.5315067768096924, "reward_std": 0.37495577335357666, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5315067768096924, "rewards/MLPCodeOnPolicy32BORM/std": 0.38837048411369324, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4829.0, "completions/mean_length": 1651.390625, "completions/min_length": 535.0, "epoch": 0.3401360544217687, "frac_reward_zero_std": 0.0, "grad_norm": 0.1304295648701895, "kl": 0.000332591352162126, "learning_rate": 3.75e-07, "loss": 3.501772880554199e-07, "num_turns": 2.0, "reward": 0.3912454843521118, "reward_std": 0.45414209365844727, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3912455141544342, "rewards/MLPCodeOnPolicy32BORM/std": 0.4752326011657715, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4701.0, "completions/mean_length": 1614.3359375, "completions/min_length": 570.0, "epoch": 0.3435374149659864, "frac_reward_zero_std": 0.0, "grad_norm": 0.1245550118423448, "kl": 0.00038185133644219604, "learning_rate": 3.726553907593401e-07, "loss": 3.594905138015747e-07, "num_turns": 2.0, "reward": 0.42279961705207825, "reward_std": 0.3496551215648651, "rewards/MLPCodeOnPolicy32BORM/mean": 0.42279961705207825, "rewards/MLPCodeOnPolicy32BORM/std": 0.35374781489372253, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1682.421875, "completions/min_length": 566.0, "epoch": 0.3469387755102041, "frac_reward_zero_std": 0.0625, "grad_norm": 0.12739944992531263, "kl": 0.0003275773358382139, "learning_rate": 3.7029648612270123e-07, "loss": 3.2782554626464844e-07, "num_turns": 2.0, "reward": 0.2956712245941162, "reward_std": 0.4861956238746643, "rewards/MLPCodeOnPolicy32BORM/mean": 0.2956712245941162, "rewards/MLPCodeOnPolicy32BORM/std": 0.5181453227996826, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 8193.0, "completions/mean_length": 1673.75390625, "completions/min_length": 562.0, "epoch": 0.35034013605442177, "frac_reward_zero_std": 0.0, "grad_norm": 0.13445060320474667, "kl": 0.000384508166462183, "learning_rate": 3.6792356101869156e-07, "loss": 3.725290298461914e-07, "num_turns": 2.0, "reward": 0.3457567095756531, "reward_std": 0.39632177352905273, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3457567095756531, "rewards/MLPCodeOnPolicy32BORM/std": 0.41148948669433594, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1538.5703125, "completions/min_length": 458.0, "epoch": 0.35374149659863946, "frac_reward_zero_std": 0.0, "grad_norm": 0.13289507816296844, "kl": 0.00042833897032323875, "learning_rate": 3.655368920099942e-07, "loss": 3.7997961044311523e-07, "num_turns": 2.0, "reward": 0.4717833995819092, "reward_std": 0.3555891215801239, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4717833995819092, "rewards/MLPCodeOnPolicy32BORM/std": 0.3742328882217407, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5154.0, "completions/mean_length": 1680.16015625, "completions/min_length": 502.0, "epoch": 0.35714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.12610684349294984, "kl": 0.00036339681264507817, "learning_rate": 3.6313675726113475e-07, "loss": 3.650784492492676e-07, "num_turns": 2.0, "reward": 0.42708680033683777, "reward_std": 0.4701390862464905, "rewards/MLPCodeOnPolicy32BORM/mean": 0.42708683013916016, "rewards/MLPCodeOnPolicy32BORM/std": 0.47297847270965576, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4806.0, "completions/mean_length": 1588.52734375, "completions/min_length": 433.0, "epoch": 0.36054421768707484, "frac_reward_zero_std": 0.0, "grad_norm": 0.12977436985513188, "kl": 0.0004353736439952627, "learning_rate": 3.607234365060604e-07, "loss": 3.5390257835388184e-07, "num_turns": 2.0, "reward": 0.49587902426719666, "reward_std": 0.3839839696884155, "rewards/MLPCodeOnPolicy32BORM/mean": 0.49587899446487427, "rewards/MLPCodeOnPolicy32BORM/std": 0.40533238649368286, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6636.0, "completions/mean_length": 1518.19140625, "completions/min_length": 507.0, "epoch": 0.36394557823129253, "frac_reward_zero_std": 0.0, "grad_norm": 0.13218204603131029, "kl": 0.00048701832611186546, "learning_rate": 3.5829721101553826e-07, "loss": 5.308538675308228e-07, "num_turns": 2.0, "reward": 0.41638830304145813, "reward_std": 0.38539189100265503, "rewards/MLPCodeOnPolicy32BORM/mean": 0.41638830304145813, "rewards/MLPCodeOnPolicy32BORM/std": 0.3954349458217621, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6301.0, "completions/mean_length": 1585.0234375, "completions/min_length": 454.0, "epoch": 0.3673469387755102, "frac_reward_zero_std": 0.0, "grad_norm": 0.12737274117239786, "kl": 0.0004323695679886441, "learning_rate": 3.558583635643726e-07, "loss": 4.507601261138916e-07, "num_turns": 2.0, "reward": 0.5039672255516052, "reward_std": 0.401162713766098, "rewards/MLPCodeOnPolicy32BORM/mean": 0.50396728515625, "rewards/MLPCodeOnPolicy32BORM/std": 0.4077160656452179, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1631.3125, "completions/min_length": 535.0, "epoch": 0.3707482993197279, "frac_reward_zero_std": 0.0, "grad_norm": 0.12547409000395443, "kl": 0.00047672909840912325, "learning_rate": 3.5340717839844787e-07, "loss": 4.246830940246582e-07, "num_turns": 2.0, "reward": 0.4858195185661316, "reward_std": 0.3277161717414856, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4858194887638092, "rewards/MLPCodeOnPolicy32BORM/std": 0.3596702814102173, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4351.0, "completions/mean_length": 1497.08984375, "completions/min_length": 647.0, "epoch": 0.3741496598639456, "frac_reward_zero_std": 0.0, "grad_norm": 0.13036809035265715, "kl": 0.0005725711534978473, "learning_rate": 3.509439412016004e-07, "loss": 5.327165126800537e-07, "num_turns": 2.0, "reward": 0.41480642557144165, "reward_std": 0.4249509572982788, "rewards/MLPCodeOnPolicy32BORM/mean": 0.41480645537376404, "rewards/MLPCodeOnPolicy32BORM/std": 0.445372611284256, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4360.0, "completions/mean_length": 1546.18359375, "completions/min_length": 463.0, "epoch": 0.37755102040816324, "frac_reward_zero_std": 0.0, "grad_norm": 0.13263776844617658, "kl": 0.0005119868446854525, "learning_rate": 3.484689390623218e-07, "loss": 5.178153514862061e-07, "num_turns": 2.0, "reward": 0.39183998107910156, "reward_std": 0.4916958510875702, "rewards/MLPCodeOnPolicy32BORM/mean": 0.39183998107910156, "rewards/MLPCodeOnPolicy32BORM/std": 0.4971933960914612, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6743.0, "completions/mean_length": 1607.42578125, "completions/min_length": 533.0, "epoch": 0.38095238095238093, "frac_reward_zero_std": 0.0, "grad_norm": 0.1335978649489609, "kl": 0.0005102338586766564, "learning_rate": 3.4598246044029906e-07, "loss": 5.513429641723633e-07, "num_turns": 2.0, "reward": 0.3588225543498993, "reward_std": 0.4509512782096863, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3588225841522217, "rewards/MLPCodeOnPolicy32BORM/std": 0.48127850890159607, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4036.0, "completions/mean_length": 1477.58203125, "completions/min_length": 466.0, "epoch": 0.3843537414965986, "frac_reward_zero_std": 0.0, "grad_norm": 0.12933390137700235, "kl": 0.000587871592870215, "learning_rate": 3.4348479513279484e-07, "loss": 5.550682544708252e-07, "num_turns": 2.0, "reward": 0.5935378074645996, "reward_std": 0.33069849014282227, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5935378074645996, "rewards/MLPCodeOnPolicy32BORM/std": 0.3454819619655609, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3709.0, "completions/mean_length": 1475.26171875, "completions/min_length": 436.0, "epoch": 0.3877551020408163, "frac_reward_zero_std": 0.0, "grad_norm": 0.13991771662886215, "kl": 0.0006190207309373363, "learning_rate": 3.409762342408719e-07, "loss": 6.48200511932373e-07, "num_turns": 2.0, "reward": 0.5287143588066101, "reward_std": 0.4257807731628418, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5287143588066101, "rewards/MLPCodeOnPolicy32BORM/std": 0.43025365471839905, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1525.3359375, "completions/min_length": 594.0, "epoch": 0.391156462585034, "frac_reward_zero_std": 0.0, "grad_norm": 0.12956428174558993, "kl": 0.0006674773485428886, "learning_rate": 3.384570701354652e-07, "loss": 6.612390279769897e-07, "num_turns": 2.0, "reward": 0.4876652956008911, "reward_std": 0.40975645184516907, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4876652956008911, "rewards/MLPCodeOnPolicy32BORM/std": 0.4222400188446045, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4185.0, "completions/mean_length": 1450.0, "completions/min_length": 421.0, "epoch": 0.3945578231292517, "frac_reward_zero_std": 0.0, "grad_norm": 0.13705414529406734, "kl": 0.0007245506872095575, "learning_rate": 3.359275964233066e-07, "loss": 7.245689630508423e-07, "num_turns": 2.0, "reward": 0.507308840751648, "reward_std": 0.45902687311172485, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5073089003562927, "rewards/MLPCodeOnPolicy32BORM/std": 0.4505447745323181, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4866.0, "completions/mean_length": 1473.921875, "completions/min_length": 538.0, "epoch": 0.3979591836734694, "frac_reward_zero_std": 0.0, "grad_norm": 0.13099605307064782, "kl": 0.0007409593117699842, "learning_rate": 3.3338810791270517e-07, "loss": 6.780028343200684e-07, "num_turns": 2.0, "reward": 0.3987925350666046, "reward_std": 0.4523935317993164, "rewards/MLPCodeOnPolicy32BORM/mean": 0.398792564868927, "rewards/MLPCodeOnPolicy32BORM/std": 0.4649738073348999, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3860.0, "completions/mean_length": 1428.5625, "completions/min_length": 446.0, "epoch": 0.4013605442176871, "frac_reward_zero_std": 0.0, "grad_norm": 0.12534450760531446, "kl": 0.0008648029947835312, "learning_rate": 3.308389005791871e-07, "loss": 9.611248970031738e-07, "num_turns": 2.0, "reward": 0.604638934135437, "reward_std": 0.3078837990760803, "rewards/MLPCodeOnPolicy32BORM/mean": 0.604638934135437, "rewards/MLPCodeOnPolicy32BORM/std": 0.33188706636428833, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1439.98046875, "completions/min_length": 430.0, "epoch": 0.40476190476190477, "frac_reward_zero_std": 0.0, "grad_norm": 0.12123023441006979, "kl": 0.0009042763977049617, "learning_rate": 3.282802715310006e-07, "loss": 9.08970832824707e-07, "num_turns": 2.0, "reward": 0.46023768186569214, "reward_std": 0.3499431610107422, "rewards/MLPCodeOnPolicy32BORM/mean": 0.46023768186569214, "rewards/MLPCodeOnPolicy32BORM/std": 0.3669500946998596, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3627.0, "completions/mean_length": 1412.05078125, "completions/min_length": 601.0, "epoch": 0.40816326530612246, "frac_reward_zero_std": 0.0, "grad_norm": 0.12231666388502688, "kl": 0.0009289461663684051, "learning_rate": 3.2571251897448763e-07, "loss": 9.201467037200928e-07, "num_turns": 2.0, "reward": 0.44291263818740845, "reward_std": 0.4349104166030884, "rewards/MLPCodeOnPolicy32BORM/mean": 0.44291260838508606, "rewards/MLPCodeOnPolicy32BORM/std": 0.44798025488853455, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4455.0, "completions/mean_length": 1373.9921875, "completions/min_length": 417.0, "epoch": 0.41156462585034015, "frac_reward_zero_std": 0.0, "grad_norm": 0.129259993408259, "kl": 0.0010929724148809328, "learning_rate": 3.2313594217932854e-07, "loss": 1.1213123798370361e-06, "num_turns": 2.0, "reward": 0.3926813304424286, "reward_std": 0.4589789807796478, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3926813304424286, "rewards/MLPCodeOnPolicy32BORM/std": 0.4817292094230652, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1317.17578125, "completions/min_length": 514.0, "epoch": 0.41496598639455784, "frac_reward_zero_std": 0.0, "grad_norm": 0.13232964107425263, "kl": 0.001064850624970859, "learning_rate": 3.205508414436619e-07, "loss": 1.1045485734939575e-06, "num_turns": 2.0, "reward": 0.47658634185791016, "reward_std": 0.46201151609420776, "rewards/MLPCodeOnPolicy32BORM/mean": 0.47658634185791016, "rewards/MLPCodeOnPolicy32BORM/std": 0.4703359305858612, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5202.0, "completions/mean_length": 1406.23828125, "completions/min_length": 566.0, "epoch": 0.41836734693877553, "frac_reward_zero_std": 0.0, "grad_norm": 0.1272398524800165, "kl": 0.0010337179801354068, "learning_rate": 3.179575180590857e-07, "loss": 1.0048970580101013e-06, "num_turns": 2.0, "reward": 0.5132541656494141, "reward_std": 0.39524388313293457, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5132542252540588, "rewards/MLPCodeOnPolicy32BORM/std": 0.41758498549461365, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4192.0, "completions/mean_length": 1318.7265625, "completions/min_length": 481.0, "epoch": 0.4217687074829932, "frac_reward_zero_std": 0.0, "grad_norm": 0.1322752900393093, "kl": 0.0011480792186375766, "learning_rate": 3.153562742755414e-07, "loss": 1.0300427675247192e-06, "num_turns": 2.0, "reward": 0.5350508689880371, "reward_std": 0.37076908349990845, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5350508689880371, "rewards/MLPCodeOnPolicy32BORM/std": 0.39439699053764343, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3802.0, "completions/mean_length": 1236.74609375, "completions/min_length": 491.0, "epoch": 0.42517006802721086, "frac_reward_zero_std": 0.0, "grad_norm": 0.1269238046945135, "kl": 0.001386871756039909, "learning_rate": 3.1274741326608717e-07, "loss": 1.4044344425201416e-06, "num_turns": 2.0, "reward": 0.5489836931228638, "reward_std": 0.3799617290496826, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5489837527275085, "rewards/MLPCodeOnPolicy32BORM/std": 0.4073076844215393, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3207.0, "completions/mean_length": 1224.5, "completions/min_length": 463.0, "epoch": 0.42857142857142855, "frac_reward_zero_std": 0.0, "grad_norm": 0.12528652678460261, "kl": 0.001363825639600691, "learning_rate": 3.101312390915634e-07, "loss": 1.3401731848716736e-06, "num_turns": 2.0, "reward": 0.5499537587165833, "reward_std": 0.41209450364112854, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5499536991119385, "rewards/MLPCodeOnPolicy32BORM/std": 0.42937520146369934, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3540.0, "completions/mean_length": 1237.4140625, "completions/min_length": 495.0, "epoch": 0.43197278911564624, "frac_reward_zero_std": 0.0, "grad_norm": 0.13733437857964267, "kl": 0.0014534044730680762, "learning_rate": 3.075080566651544e-07, "loss": 1.4118850231170654e-06, "num_turns": 2.0, "reward": 0.5841702818870544, "reward_std": 0.3928337097167969, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5841702818870544, "rewards/MLPCodeOnPolicy32BORM/std": 0.40045365691185, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6243.0, "completions/mean_length": 1382.99609375, "completions/min_length": 428.0, "epoch": 0.43537414965986393, "frac_reward_zero_std": 0.0, "grad_norm": 0.12486667422213141, "kl": 0.0012770373573403049, "learning_rate": 3.048781717168513e-07, "loss": 1.261010766029358e-06, "num_turns": 2.0, "reward": 0.5601485967636108, "reward_std": 0.3312610387802124, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5601485967636108, "rewards/MLPCodeOnPolicy32BORM/std": 0.35082247853279114, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3410.0, "completions/mean_length": 1243.3359375, "completions/min_length": 499.0, "epoch": 0.4387755102040816, "frac_reward_zero_std": 0.0, "grad_norm": 0.12080802782214185, "kl": 0.0014379564872797346, "learning_rate": 3.022418907578188e-07, "loss": 1.3951212167739868e-06, "num_turns": 2.0, "reward": 0.5687495470046997, "reward_std": 0.32862186431884766, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5687495470046997, "rewards/MLPCodeOnPolicy32BORM/std": 0.3444008231163025, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1209.078125, "completions/min_length": 424.0, "epoch": 0.4421768707482993, "frac_reward_zero_std": 0.0, "grad_norm": 0.13074055235654158, "kl": 0.0014855460885883076, "learning_rate": 2.9959952104467243e-07, "loss": 1.4938414096832275e-06, "num_turns": 2.0, "reward": 0.4377948045730591, "reward_std": 0.3570207357406616, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4377948045730591, "rewards/MLPCodeOnPolicy32BORM/std": 0.37782377004623413, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4741.0, "completions/mean_length": 1240.36328125, "completions/min_length": 440.0, "epoch": 0.445578231292517, "frac_reward_zero_std": 0.0, "grad_norm": 0.1331763029150445, "kl": 0.001475883183957194, "learning_rate": 2.9695137054366753e-07, "loss": 1.4230608940124512e-06, "num_turns": 2.0, "reward": 0.5108271241188049, "reward_std": 0.36244717240333557, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5108271241188049, "rewards/MLPCodeOnPolicy32BORM/std": 0.36959731578826904, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6489.0, "completions/mean_length": 1305.765625, "completions/min_length": 430.0, "epoch": 0.4489795918367347, "frac_reward_zero_std": 0.0, "grad_norm": 0.1290586781632192, "kl": 0.001309950057475362, "learning_rate": 2.942977478948057e-07, "loss": 1.296401023864746e-06, "num_turns": 2.0, "reward": 0.5338167548179626, "reward_std": 0.4010734260082245, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5338166952133179, "rewards/MLPCodeOnPolicy32BORM/std": 0.4047679305076599, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5633.0, "completions/mean_length": 1277.81640625, "completions/min_length": 414.0, "epoch": 0.4523809523809524, "frac_reward_zero_std": 0.0, "grad_norm": 0.13533104933339096, "kl": 0.0014049280389372143, "learning_rate": 2.916389623758636e-07, "loss": 1.3941898941993713e-06, "num_turns": 2.0, "reward": 0.4498026967048645, "reward_std": 0.4285493791103363, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4498027265071869, "rewards/MLPCodeOnPolicy32BORM/std": 0.4484274387359619, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4146.0, "completions/mean_length": 1157.87109375, "completions/min_length": 483.0, "epoch": 0.4557823129251701, "frac_reward_zero_std": 0.0, "grad_norm": 0.1429767214006691, "kl": 0.0016457732672279235, "learning_rate": 2.889753238663466e-07, "loss": 1.6689300537109375e-06, "num_turns": 2.0, "reward": 0.4900895953178406, "reward_std": 0.4475466310977936, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4900895953178406, "rewards/MLPCodeOnPolicy32BORM/std": 0.44956454634666443, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3353.0, "completions/mean_length": 1228.65625, "completions/min_length": 536.0, "epoch": 0.45918367346938777, "frac_reward_zero_std": 0.0, "grad_norm": 0.12608630288605216, "kl": 0.001589239196619019, "learning_rate": 2.863071428113726e-07, "loss": 1.5730038285255432e-06, "num_turns": 2.0, "reward": 0.42409783601760864, "reward_std": 0.4034125804901123, "rewards/MLPCodeOnPolicy32BORM/mean": 0.42409780621528625, "rewards/MLPCodeOnPolicy32BORM/std": 0.41312265396118164, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3471.0, "completions/mean_length": 1234.89453125, "completions/min_length": 612.0, "epoch": 0.46258503401360546, "frac_reward_zero_std": 0.0, "grad_norm": 0.12497242950925527, "kl": 0.001500208744801057, "learning_rate": 2.836347301854897e-07, "loss": 1.475214958190918e-06, "num_turns": 2.0, "reward": 0.5871116518974304, "reward_std": 0.2970415949821472, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5871115922927856, "rewards/MLPCodeOnPolicy32BORM/std": 0.32421791553497314, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2919.0, "completions/mean_length": 1207.34765625, "completions/min_length": 409.0, "epoch": 0.46598639455782315, "frac_reward_zero_std": 0.0, "grad_norm": 0.13123736127619876, "kl": 0.0015240276952681597, "learning_rate": 2.8095839745643255e-07, "loss": 1.5515834093093872e-06, "num_turns": 2.0, "reward": 0.5804826021194458, "reward_std": 0.3200109302997589, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5804826021194458, "rewards/MLPCodeOnPolicy32BORM/std": 0.3496645390987396, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4525.0, "completions/mean_length": 1346.73046875, "completions/min_length": 507.0, "epoch": 0.46938775510204084, "frac_reward_zero_std": 0.0, "grad_norm": 0.12325423385372133, "kl": 0.0013416068013611948, "learning_rate": 2.782784565488211e-07, "loss": 1.3336539268493652e-06, "num_turns": 2.0, "reward": 0.3281669020652771, "reward_std": 0.4673292338848114, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3281669020652771, "rewards/MLPCodeOnPolicy32BORM/std": 0.47725093364715576, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4483.0, "completions/mean_length": 1220.921875, "completions/min_length": 450.0, "epoch": 0.47278911564625853, "frac_reward_zero_std": 0.0, "grad_norm": 0.12885536248220428, "kl": 0.00149958189103927, "learning_rate": 2.7559521980780566e-07, "loss": 1.4808028936386108e-06, "num_turns": 2.0, "reward": 0.532761812210083, "reward_std": 0.34896931052207947, "rewards/MLPCodeOnPolicy32BORM/mean": 0.532761812210083, "rewards/MLPCodeOnPolicy32BORM/std": 0.40563684701919556, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3037.0, "completions/mean_length": 1221.2890625, "completions/min_length": 488.0, "epoch": 0.47619047619047616, "frac_reward_zero_std": 0.0, "grad_norm": 0.13869574814859498, "kl": 0.0014890359529999841, "learning_rate": 2.729089999626637e-07, "loss": 1.4794059097766876e-06, "num_turns": 2.0, "reward": 0.4990660846233368, "reward_std": 0.39672043919563293, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4990660846233368, "rewards/MLPCodeOnPolicy32BORM/std": 0.41187581419944763, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4350.0, "completions/mean_length": 1237.12890625, "completions/min_length": 502.0, "epoch": 0.47959183673469385, "frac_reward_zero_std": 0.0, "grad_norm": 0.12395471455250524, "kl": 0.0014677665994895506, "learning_rate": 2.7022011009035107e-07, "loss": 1.3653188943862915e-06, "num_turns": 2.0, "reward": 0.4990885257720947, "reward_std": 0.4302549362182617, "rewards/MLPCodeOnPolicy32BORM/mean": 0.49908849596977234, "rewards/MLPCodeOnPolicy32BORM/std": 0.43232402205467224, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3966.0, "completions/mean_length": 1265.9140625, "completions/min_length": 425.0, "epoch": 0.48299319727891155, "frac_reward_zero_std": 0.0, "grad_norm": 0.13012857456910262, "kl": 0.0014557794766005827, "learning_rate": 2.675288635790135e-07, "loss": 1.385807991027832e-06, "num_turns": 2.0, "reward": 0.4568708539009094, "reward_std": 0.4049414396286011, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4568708539009094, "rewards/MLPCodeOnPolicy32BORM/std": 0.4264875650405884, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1365.4765625, "completions/min_length": 420.0, "epoch": 0.48639455782312924, "frac_reward_zero_std": 0.0, "grad_norm": 0.11805162085928556, "kl": 0.0013945812579549965, "learning_rate": 2.648355740914613e-07, "loss": 1.4118850231170654e-06, "num_turns": 2.0, "reward": 0.4730561375617981, "reward_std": 0.38758885860443115, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4730561375617981, "rewards/MLPCodeOnPolicy32BORM/std": 0.40097248554229736, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3269.0, "completions/mean_length": 1158.1328125, "completions/min_length": 486.0, "epoch": 0.4897959183673469, "frac_reward_zero_std": 0.0, "grad_norm": 0.13022193041479954, "kl": 0.0015572439251627657, "learning_rate": 2.621405555286121e-07, "loss": 1.6046687960624695e-06, "num_turns": 2.0, "reward": 0.4237700402736664, "reward_std": 0.4052298665046692, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4237700402736664, "rewards/MLPCodeOnPolicy32BORM/std": 0.4302515387535095, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4477.0, "completions/mean_length": 1271.70703125, "completions/min_length": 438.0, "epoch": 0.4931972789115646, "frac_reward_zero_std": 0.0, "grad_norm": 0.13026005270843405, "kl": 0.0014919168093001645, "learning_rate": 2.594441219929058e-07, "loss": 1.5385448932647705e-06, "num_turns": 2.0, "reward": 0.4565078318119049, "reward_std": 0.43446728587150574, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4565078318119049, "rewards/MLPCodeOnPolicy32BORM/std": 0.4460132122039795, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3277.0, "completions/mean_length": 1155.765625, "completions/min_length": 569.0, "epoch": 0.4965986394557823, "frac_reward_zero_std": 0.0, "grad_norm": 0.12931491561349248, "kl": 0.0016008713791961782, "learning_rate": 2.5674658775169677e-07, "loss": 1.6149133443832397e-06, "num_turns": 2.0, "reward": 0.4877603054046631, "reward_std": 0.41081666946411133, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4877602756023407, "rewards/MLPCodeOnPolicy32BORM/std": 0.43899431824684143, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3038.0, "completions/mean_length": 1189.8828125, "completions/min_length": 449.0, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.12702661749227706, "kl": 0.0015393904413940618, "learning_rate": 2.540482672006254e-07, "loss": 1.5124678611755371e-06, "num_turns": 2.0, "reward": 0.500877857208252, "reward_std": 0.3198213577270508, "rewards/MLPCodeOnPolicy32BORM/mean": 0.500877857208252, "rewards/MLPCodeOnPolicy32BORM/std": 0.3381810188293457, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3720.0, "completions/mean_length": 1171.10546875, "completions/min_length": 432.0, "epoch": 0.5034013605442177, "frac_reward_zero_std": 0.0, "grad_norm": 0.134361927289558, "kl": 0.0016590512541370117, "learning_rate": 2.513494748269761e-07, "loss": 1.6801059246063232e-06, "num_turns": 2.0, "reward": 0.5662176012992859, "reward_std": 0.3807193636894226, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5662176012992859, "rewards/MLPCodeOnPolicy32BORM/std": 0.38985222578048706, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3483.0, "completions/mean_length": 1202.5234375, "completions/min_length": 473.0, "epoch": 0.5068027210884354, "frac_reward_zero_std": 0.0, "grad_norm": 0.12963409344180143, "kl": 0.001650203145800333, "learning_rate": 2.4865052517302394e-07, "loss": 1.5692785382270813e-06, "num_turns": 2.0, "reward": 0.5353552103042603, "reward_std": 0.37545841932296753, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5353552103042603, "rewards/MLPCodeOnPolicy32BORM/std": 0.3899039626121521, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3251.0, "completions/mean_length": 1229.546875, "completions/min_length": 406.0, "epoch": 0.5102040816326531, "frac_reward_zero_std": 0.0, "grad_norm": 0.12807932315566303, "kl": 0.001541927182188374, "learning_rate": 2.459517327993746e-07, "loss": 1.5497207641601562e-06, "num_turns": 2.0, "reward": 0.4594876766204834, "reward_std": 0.3596667945384979, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4594876766204834, "rewards/MLPCodeOnPolicy32BORM/std": 0.39366379380226135, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4203.0, "completions/mean_length": 1162.60546875, "completions/min_length": 493.0, "epoch": 0.5136054421768708, "frac_reward_zero_std": 0.0, "grad_norm": 0.13195509056801039, "kl": 0.0017888068650790956, "learning_rate": 2.4325341224830326e-07, "loss": 1.7918646335601807e-06, "num_turns": 2.0, "reward": 0.7158861756324768, "reward_std": 0.2936908006668091, "rewards/MLPCodeOnPolicy32BORM/mean": 0.7158861756324768, "rewards/MLPCodeOnPolicy32BORM/std": 0.3239665925502777, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3430.0, "completions/mean_length": 1246.49609375, "completions/min_length": 502.0, "epoch": 0.5170068027210885, "frac_reward_zero_std": 0.0, "grad_norm": 0.12349750689603009, "kl": 0.0015071489342517452, "learning_rate": 2.405558780070942e-07, "loss": 1.4919787645339966e-06, "num_turns": 2.0, "reward": 0.47492241859436035, "reward_std": 0.4099407494068146, "rewards/MLPCodeOnPolicy32BORM/mean": 0.47492241859436035, "rewards/MLPCodeOnPolicy32BORM/std": 0.4219662547111511, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3985.0, "completions/mean_length": 1217.10546875, "completions/min_length": 489.0, "epoch": 0.5204081632653061, "frac_reward_zero_std": 0.0, "grad_norm": 0.14744966056557027, "kl": 0.0015069277460497688, "learning_rate": 2.37859444471388e-07, "loss": 1.4007091522216797e-06, "num_turns": 2.0, "reward": 0.39524000883102417, "reward_std": 0.4498276710510254, "rewards/MLPCodeOnPolicy32BORM/mean": 0.3952399790287018, "rewards/MLPCodeOnPolicy32BORM/std": 0.4426690936088562, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4924.0, "completions/mean_length": 1208.10546875, "completions/min_length": 449.0, "epoch": 0.5238095238095238, "frac_reward_zero_std": 0.0, "grad_norm": 0.1270108879721089, "kl": 0.0017198402274516411, "learning_rate": 2.3516442590853866e-07, "loss": 1.7210841178894043e-06, "num_turns": 2.0, "reward": 0.5516296625137329, "reward_std": 0.410399854183197, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5516296625137329, "rewards/MLPCodeOnPolicy32BORM/std": 0.40337714552879333, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6468.0, "completions/mean_length": 1229.87890625, "completions/min_length": 399.0, "epoch": 0.5272108843537415, "frac_reward_zero_std": 0.0, "grad_norm": 0.13149493698674622, "kl": 0.0015122121412787237, "learning_rate": 2.3247113642098645e-07, "loss": 1.5497207641601562e-06, "num_turns": 2.0, "reward": 0.476318895816803, "reward_std": 0.42769843339920044, "rewards/MLPCodeOnPolicy32BORM/mean": 0.476318895816803, "rewards/MLPCodeOnPolicy32BORM/std": 0.4319699704647064, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1269.3125, "completions/min_length": 429.0, "epoch": 0.5306122448979592, "frac_reward_zero_std": 0.0, "grad_norm": 0.1304574189322417, "kl": 0.0015285064409908955, "learning_rate": 2.2977988990964896e-07, "loss": 1.5534460544586182e-06, "num_turns": 2.0, "reward": 0.6005215048789978, "reward_std": 0.32493290305137634, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6005215048789978, "rewards/MLPCodeOnPolicy32BORM/std": 0.34041628241539, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4384.0, "completions/mean_length": 1235.43359375, "completions/min_length": 593.0, "epoch": 0.5340136054421769, "frac_reward_zero_std": 0.0, "grad_norm": 0.12127411001199988, "kl": 0.0016412290724474587, "learning_rate": 2.2709100003733634e-07, "loss": 1.519918441772461e-06, "num_turns": 2.0, "reward": 0.5233211517333984, "reward_std": 0.3616924583911896, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5233211517333984, "rewards/MLPCodeOnPolicy32BORM/std": 0.3967532515525818, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5871.0, "completions/mean_length": 1226.09375, "completions/min_length": 506.0, "epoch": 0.5374149659863946, "frac_reward_zero_std": 0.0, "grad_norm": 0.12594349415909045, "kl": 0.0013712193344872503, "learning_rate": 2.2440478019219437e-07, "loss": 1.3541430234909058e-06, "num_turns": 2.0, "reward": 0.5536775588989258, "reward_std": 0.34202492237091064, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5536775588989258, "rewards/MLPCodeOnPolicy32BORM/std": 0.35093897581100464, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2961.0, "completions/mean_length": 1209.2421875, "completions/min_length": 504.0, "epoch": 0.5408163265306123, "frac_reward_zero_std": 0.0, "grad_norm": 0.127306231377746, "kl": 0.0014342863169076736, "learning_rate": 2.2172154345117894e-07, "loss": 1.430511474609375e-06, "num_turns": 2.0, "reward": 0.5524036884307861, "reward_std": 0.32542017102241516, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5524036884307861, "rewards/MLPCodeOnPolicy32BORM/std": 0.333621621131897, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3949.0, "completions/mean_length": 1182.046875, "completions/min_length": 472.0, "epoch": 0.54421768707483, "frac_reward_zero_std": 0.0, "grad_norm": 0.130686322143201, "kl": 0.0014794531780353282, "learning_rate": 2.1904160254356748e-07, "loss": 1.4044344425201416e-06, "num_turns": 2.0, "reward": 0.6170496344566345, "reward_std": 0.3277736008167267, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6170496344566345, "rewards/MLPCodeOnPolicy32BORM/std": 0.3628644049167633, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2928.0, "completions/mean_length": 1152.2265625, "completions/min_length": 511.0, "epoch": 0.5476190476190477, "frac_reward_zero_std": 0.0, "grad_norm": 0.132823142268967, "kl": 0.0014511521703752805, "learning_rate": 2.1636526981451036e-07, "loss": 1.5888363122940063e-06, "num_turns": 2.0, "reward": 0.5477647185325623, "reward_std": 0.3859521746635437, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5477646589279175, "rewards/MLPCodeOnPolicy32BORM/std": 0.4129984378814697, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4990.0, "completions/mean_length": 1285.828125, "completions/min_length": 467.0, "epoch": 0.5510204081632653, "frac_reward_zero_std": 0.0, "grad_norm": 0.12785688650096694, "kl": 0.0013251102805043047, "learning_rate": 2.1369285718862748e-07, "loss": 1.259148120880127e-06, "num_turns": 2.0, "reward": 0.5367299914360046, "reward_std": 0.3701598346233368, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5367300510406494, "rewards/MLPCodeOnPolicy32BORM/std": 0.3895636796951294, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4772.0, "completions/mean_length": 1222.5625, "completions/min_length": 446.0, "epoch": 0.5544217687074829, "frac_reward_zero_std": 0.0, "grad_norm": 0.1330333320092087, "kl": 0.0012835902516599162, "learning_rate": 2.1102467613365334e-07, "loss": 1.3317912817001343e-06, "num_turns": 2.0, "reward": 0.5249505043029785, "reward_std": 0.383068710565567, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5249505043029785, "rewards/MLPCodeOnPolicy32BORM/std": 0.39008262753486633, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2926.0, "completions/mean_length": 1287.45703125, "completions/min_length": 528.0, "epoch": 0.5578231292517006, "frac_reward_zero_std": 0.0, "grad_norm": 0.1407269238620054, "kl": 0.0013230719678176683, "learning_rate": 2.0836103762413638e-07, "loss": 1.344829797744751e-06, "num_turns": 2.0, "reward": 0.5449553728103638, "reward_std": 0.4117432236671448, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5449553728103638, "rewards/MLPCodeOnPolicy32BORM/std": 0.4277450442314148, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3309.0, "completions/mean_length": 1280.25390625, "completions/min_length": 509.0, "epoch": 0.5612244897959183, "frac_reward_zero_std": 0.0, "grad_norm": 0.12404282707169718, "kl": 0.0013554931147155003, "learning_rate": 2.0570225210519433e-07, "loss": 1.3336539268493652e-06, "num_turns": 2.0, "reward": 0.5558387041091919, "reward_std": 0.3796404004096985, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5558386445045471, "rewards/MLPCodeOnPolicy32BORM/std": 0.41854989528656006, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3314.0, "completions/mean_length": 1228.93359375, "completions/min_length": 471.0, "epoch": 0.564625850340136, "frac_reward_zero_std": 0.0, "grad_norm": 0.12571021073988586, "kl": 0.0013914067976656952, "learning_rate": 2.0304862945633247e-07, "loss": 1.4193356037139893e-06, "num_turns": 2.0, "reward": 0.5028097033500671, "reward_std": 0.36212992668151855, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5028097033500671, "rewards/MLPCodeOnPolicy32BORM/std": 0.36804434657096863, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3651.0, "completions/mean_length": 1311.5703125, "completions/min_length": 564.0, "epoch": 0.5680272108843537, "frac_reward_zero_std": 0.0, "grad_norm": 0.12460650819042984, "kl": 0.0012771672645612853, "learning_rate": 2.0040047895532752e-07, "loss": 1.2405216693878174e-06, "num_turns": 2.0, "reward": 0.501116931438446, "reward_std": 0.3999606668949127, "rewards/MLPCodeOnPolicy32BORM/mean": 0.501116931438446, "rewards/MLPCodeOnPolicy32BORM/std": 0.4230176508426666, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1383.45703125, "completions/min_length": 521.0, "epoch": 0.5714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.1184463202674069, "kl": 0.001180576521619514, "learning_rate": 1.977581092421812e-07, "loss": 1.173466444015503e-06, "num_turns": 2.0, "reward": 0.39330971240997314, "reward_std": 0.4362216889858246, "rewards/MLPCodeOnPolicy32BORM/mean": 0.39330971240997314, "rewards/MLPCodeOnPolicy32BORM/std": 0.4408242404460907, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3373.0, "completions/mean_length": 1204.75390625, "completions/min_length": 447.0, "epoch": 0.5748299319727891, "frac_reward_zero_std": 0.0, "grad_norm": 0.12632851795944025, "kl": 0.0014828696912445594, "learning_rate": 1.9512182828314882e-07, "loss": 1.475214958190918e-06, "num_turns": 2.0, "reward": 0.5679004788398743, "reward_std": 0.33988669514656067, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5679004192352295, "rewards/MLPCodeOnPolicy32BORM/std": 0.36474913358688354, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3822.0, "completions/mean_length": 1174.91015625, "completions/min_length": 435.0, "epoch": 0.5782312925170068, "frac_reward_zero_std": 0.0, "grad_norm": 0.1272662009578314, "kl": 0.001407989510880725, "learning_rate": 1.9249194333484563e-07, "loss": 1.3150274753570557e-06, "num_turns": 2.0, "reward": 0.6547111868858337, "reward_std": 0.25977566838264465, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6547111868858337, "rewards/MLPCodeOnPolicy32BORM/std": 0.2951057255268097, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2913.0, "completions/mean_length": 1225.19140625, "completions/min_length": 564.0, "epoch": 0.5816326530612245, "frac_reward_zero_std": 0.0, "grad_norm": 0.12591804144081578, "kl": 0.0013545075703405018, "learning_rate": 1.8986876090843664e-07, "loss": 1.4118850231170654e-06, "num_turns": 2.0, "reward": 0.5509231686592102, "reward_std": 0.31451380252838135, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5509231090545654, "rewards/MLPCodeOnPolicy32BORM/std": 0.34253573417663574, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5525.0, "completions/mean_length": 1350.0078125, "completions/min_length": 545.0, "epoch": 0.5850340136054422, "frac_reward_zero_std": 0.0, "grad_norm": 0.1184334593846969, "kl": 0.0012318995350142359, "learning_rate": 1.872525867339128e-07, "loss": 1.2330710887908936e-06, "num_turns": 2.0, "reward": 0.5996668338775635, "reward_std": 0.35077401995658875, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5996668338775635, "rewards/MLPCodeOnPolicy32BORM/std": 0.36193662881851196, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4773.0, "completions/mean_length": 1289.87109375, "completions/min_length": 497.0, "epoch": 0.5884353741496599, "frac_reward_zero_std": 0.0, "grad_norm": 0.13773093289384147, "kl": 0.0013899512669013347, "learning_rate": 1.8464372572445863e-07, "loss": 1.434236764907837e-06, "num_turns": 2.0, "reward": 0.5270853638648987, "reward_std": 0.40748390555381775, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5270853638648987, "rewards/MLPCodeOnPolicy32BORM/std": 0.41710126399993896, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3433.0, "completions/mean_length": 1167.64453125, "completions/min_length": 505.0, "epoch": 0.5918367346938775, "frac_reward_zero_std": 0.0, "grad_norm": 0.12112995046800766, "kl": 0.0013812177198815334, "learning_rate": 1.8204248194091425e-07, "loss": 1.344829797744751e-06, "num_turns": 2.0, "reward": 0.6126636862754822, "reward_std": 0.3266138434410095, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6126636862754822, "rewards/MLPCodeOnPolicy32BORM/std": 0.3397049903869629, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3811.0, "completions/mean_length": 1326.52734375, "completions/min_length": 448.0, "epoch": 0.5952380952380952, "frac_reward_zero_std": 0.0, "grad_norm": 0.129796948317958, "kl": 0.001184793375159643, "learning_rate": 1.7944915855633807e-07, "loss": 1.3085082173347473e-06, "num_turns": 2.0, "reward": 0.44264623522758484, "reward_std": 0.3939417004585266, "rewards/MLPCodeOnPolicy32BORM/mean": 0.44264620542526245, "rewards/MLPCodeOnPolicy32BORM/std": 0.42343512177467346, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3019.0, "completions/mean_length": 1266.4609375, "completions/min_length": 404.0, "epoch": 0.5986394557823129, "frac_reward_zero_std": 0.0, "grad_norm": 0.1207800852669321, "kl": 0.0013622808933178021, "learning_rate": 1.768640578206715e-07, "loss": 1.2665987014770508e-06, "num_turns": 2.0, "reward": 0.5672487616539001, "reward_std": 0.3637806475162506, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5672487616539001, "rewards/MLPCodeOnPolicy32BORM/std": 0.39485296607017517, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3501.0, "completions/mean_length": 1274.89453125, "completions/min_length": 405.0, "epoch": 0.6020408163265306, "frac_reward_zero_std": 0.0, "grad_norm": 0.11969351864846854, "kl": 0.0013109368346704287, "learning_rate": 1.7428748102551234e-07, "loss": 1.3485550880432129e-06, "num_turns": 2.0, "reward": 0.5126824378967285, "reward_std": 0.360828161239624, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5126823782920837, "rewards/MLPCodeOnPolicy32BORM/std": 0.37800562381744385, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3133.0, "completions/mean_length": 1312.5078125, "completions/min_length": 424.0, "epoch": 0.6054421768707483, "frac_reward_zero_std": 0.0, "grad_norm": 0.1199661599278971, "kl": 0.0012908925173178432, "learning_rate": 1.7171972846899941e-07, "loss": 1.259148120880127e-06, "num_turns": 2.0, "reward": 0.5275952816009521, "reward_std": 0.38074105978012085, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5275952816009521, "rewards/MLPCodeOnPolicy32BORM/std": 0.39470240473747253, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5515.0, "completions/mean_length": 1352.41015625, "completions/min_length": 405.0, "epoch": 0.608843537414966, "frac_reward_zero_std": 0.0, "grad_norm": 0.12621918244168606, "kl": 0.0012085978110008, "learning_rate": 1.691610994208129e-07, "loss": 1.2516975402832031e-06, "num_turns": 2.0, "reward": 0.46217256784439087, "reward_std": 0.416616827249527, "rewards/MLPCodeOnPolicy32BORM/mean": 0.46217256784439087, "rewards/MLPCodeOnPolicy32BORM/std": 0.40671467781066895, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2739.0, "completions/mean_length": 1107.76171875, "completions/min_length": 435.0, "epoch": 0.6122448979591837, "frac_reward_zero_std": 0.0, "grad_norm": 0.13966438978348047, "kl": 0.001379939973958244, "learning_rate": 1.6661189208729489e-07, "loss": 1.5422701835632324e-06, "num_turns": 2.0, "reward": 0.6245121359825134, "reward_std": 0.3324832022190094, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6245121359825134, "rewards/MLPCodeOnPolicy32BORM/std": 0.3664103150367737, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2833.0, "completions/mean_length": 1239.33203125, "completions/min_length": 295.0, "epoch": 0.6156462585034014, "frac_reward_zero_std": 0.0, "grad_norm": 0.12426157616035632, "kl": 0.0013459335514198756, "learning_rate": 1.6407240357669332e-07, "loss": 1.4137476682662964e-06, "num_turns": 2.0, "reward": 0.5082270503044128, "reward_std": 0.3264538645744324, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5082271099090576, "rewards/MLPCodeOnPolicy32BORM/std": 0.3465285301208496, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3867.0, "completions/mean_length": 1174.68359375, "completions/min_length": 446.0, "epoch": 0.6190476190476191, "frac_reward_zero_std": 0.0, "grad_norm": 0.13358274021087882, "kl": 0.00134277267534344, "learning_rate": 1.6154292986453483e-07, "loss": 1.3802200555801392e-06, "num_turns": 2.0, "reward": 0.5906236171722412, "reward_std": 0.34040746092796326, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5906236171722412, "rewards/MLPCodeOnPolicy32BORM/std": 0.3793966770172119, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1359.40234375, "completions/min_length": 454.0, "epoch": 0.6224489795918368, "frac_reward_zero_std": 0.0, "grad_norm": 0.12503538969545513, "kl": 0.0013009100903218496, "learning_rate": 1.5902376575912814e-07, "loss": 1.3336539268493652e-06, "num_turns": 2.0, "reward": 0.5492093563079834, "reward_std": 0.3843424916267395, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5492092967033386, "rewards/MLPCodeOnPolicy32BORM/std": 0.41074615716934204, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2676.0, "completions/mean_length": 1193.87109375, "completions/min_length": 499.0, "epoch": 0.6258503401360545, "frac_reward_zero_std": 0.0, "grad_norm": 0.1208639891379796, "kl": 0.0013548607312259264, "learning_rate": 1.5651520486720516e-07, "loss": 1.3280659914016724e-06, "num_turns": 2.0, "reward": 0.5216140151023865, "reward_std": 0.32233840227127075, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5216140151023865, "rewards/MLPCodeOnPolicy32BORM/std": 0.3560025990009308, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3305.0, "completions/mean_length": 1245.7109375, "completions/min_length": 494.0, "epoch": 0.6292517006802721, "frac_reward_zero_std": 0.0, "grad_norm": 0.12686236905432466, "kl": 0.0012695216200881987, "learning_rate": 1.5401753955970097e-07, "loss": 1.30385160446167e-06, "num_turns": 2.0, "reward": 0.5973302721977234, "reward_std": 0.3663652539253235, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5973303318023682, "rewards/MLPCodeOnPolicy32BORM/std": 0.3876189589500427, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3152.0, "completions/mean_length": 1213.82421875, "completions/min_length": 502.0, "epoch": 0.6326530612244898, "frac_reward_zero_std": 0.0, "grad_norm": 0.12885973798806175, "kl": 0.001346839189864113, "learning_rate": 1.5153106093767825e-07, "loss": 1.3206154108047485e-06, "num_turns": 2.0, "reward": 0.6149528622627258, "reward_std": 0.3335758447647095, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6149528622627258, "rewards/MLPCodeOnPolicy32BORM/std": 0.3618091940879822, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4436.0, "completions/mean_length": 1248.99609375, "completions/min_length": 466.0, "epoch": 0.6360544217687075, "frac_reward_zero_std": 0.0, "grad_norm": 0.12657431869640567, "kl": 0.0012566588084155228, "learning_rate": 1.490560587983996e-07, "loss": 1.126900315284729e-06, "num_turns": 2.0, "reward": 0.6170510649681091, "reward_std": 0.28795865178108215, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6170510649681091, "rewards/MLPCodeOnPolicy32BORM/std": 0.3206290006637573, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3711.0, "completions/mean_length": 1272.578125, "completions/min_length": 491.0, "epoch": 0.6394557823129252, "frac_reward_zero_std": 0.0, "grad_norm": 0.12319663586080123, "kl": 0.0012713064520539774, "learning_rate": 1.465928216015522e-07, "loss": 1.1995434761047363e-06, "num_turns": 2.0, "reward": 0.5160113573074341, "reward_std": 0.38475197553634644, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5160113573074341, "rewards/MLPCodeOnPolicy32BORM/std": 0.41495686769485474, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3447.0, "completions/mean_length": 1222.0703125, "completions/min_length": 443.0, "epoch": 0.6428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.12426694619632161, "kl": 0.001386047030791815, "learning_rate": 1.4414163643562753e-07, "loss": 1.4081597328186035e-06, "num_turns": 2.0, "reward": 0.5270590782165527, "reward_std": 0.3698263168334961, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5270590782165527, "rewards/MLPCodeOnPolicy32BORM/std": 0.3735743463039398, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3303.0, "completions/mean_length": 1276.09375, "completions/min_length": 372.0, "epoch": 0.6462585034013606, "frac_reward_zero_std": 0.0, "grad_norm": 0.13033658908632512, "kl": 0.0013309565947565716, "learning_rate": 1.4170278898446175e-07, "loss": 1.3373792171478271e-06, "num_turns": 2.0, "reward": 0.5525405406951904, "reward_std": 0.4099390506744385, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5525405406951904, "rewards/MLPCodeOnPolicy32BORM/std": 0.42479801177978516, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3087.0, "completions/mean_length": 1215.47265625, "completions/min_length": 481.0, "epoch": 0.6496598639455783, "frac_reward_zero_std": 0.0, "grad_norm": 0.13086596111770168, "kl": 0.0013185567531763809, "learning_rate": 1.3927656349393952e-07, "loss": 1.3075768947601318e-06, "num_turns": 2.0, "reward": 0.5212171673774719, "reward_std": 0.4619579315185547, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5212171077728271, "rewards/MLPCodeOnPolicy32BORM/std": 0.45611312985420227, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3030.0, "completions/mean_length": 1284.39453125, "completions/min_length": 457.0, "epoch": 0.6530612244897959, "frac_reward_zero_std": 0.0, "grad_norm": 0.12121365836513875, "kl": 0.0012196112975288997, "learning_rate": 1.3686324273886528e-07, "loss": 1.2903474271297455e-06, "num_turns": 2.0, "reward": 0.5198047161102295, "reward_std": 0.408640056848526, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5198047161102295, "rewards/MLPCodeOnPolicy32BORM/std": 0.4179668426513672, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4217.0, "completions/mean_length": 1236.8125, "completions/min_length": 503.0, "epoch": 0.6564625850340136, "frac_reward_zero_std": 0.0, "grad_norm": 0.11991459011860725, "kl": 0.0013542722726924694, "learning_rate": 1.3446310799000575e-07, "loss": 1.2852251529693604e-06, "num_turns": 2.0, "reward": 0.672049880027771, "reward_std": 0.33700209856033325, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6720498204231262, "rewards/MLPCodeOnPolicy32BORM/std": 0.37094196677207947, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3159.0, "completions/mean_length": 1212.5703125, "completions/min_length": 509.0, "epoch": 0.6598639455782312, "frac_reward_zero_std": 0.0, "grad_norm": 0.13003630784097575, "kl": 0.0013068479593130178, "learning_rate": 1.3207643898130853e-07, "loss": 1.2461096048355103e-06, "num_turns": 2.0, "reward": 0.5808924436569214, "reward_std": 0.3667425215244293, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5808924436569214, "rewards/MLPCodeOnPolicy32BORM/std": 0.4038356840610504, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3690.0, "completions/mean_length": 1275.81640625, "completions/min_length": 414.0, "epoch": 0.6632653061224489, "frac_reward_zero_std": 0.0, "grad_norm": 0.13731411089997758, "kl": 0.001287659304580302, "learning_rate": 1.2970351387729872e-07, "loss": 1.2349337339401245e-06, "num_turns": 2.0, "reward": 0.5368715524673462, "reward_std": 0.3973066806793213, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5368715524673462, "rewards/MLPCodeOnPolicy32BORM/std": 0.4072633683681488, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3239.0, "completions/mean_length": 1227.8125, "completions/min_length": 412.0, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 0.13545176265600176, "kl": 0.0013441352293739328, "learning_rate": 1.273446092406599e-07, "loss": 1.2777745723724365e-06, "num_turns": 2.0, "reward": 0.5994586944580078, "reward_std": 0.36241117119789124, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5994586944580078, "rewards/MLPCodeOnPolicy32BORM/std": 0.36684757471084595, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2883.0, "completions/mean_length": 1199.0234375, "completions/min_length": 453.0, "epoch": 0.6700680272108843, "frac_reward_zero_std": 0.0, "grad_norm": 0.13288052160082212, "kl": 0.001319561021773552, "learning_rate": 1.2500000000000005e-07, "loss": 1.4156103134155273e-06, "num_turns": 2.0, "reward": 0.5040369033813477, "reward_std": 0.40427684783935547, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5040369033813477, "rewards/MLPCodeOnPolicy32BORM/std": 0.4201582968235016, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2679.0, "completions/mean_length": 1189.41796875, "completions/min_length": 480.0, "epoch": 0.673469387755102, "frac_reward_zero_std": 0.0, "grad_norm": 0.12608554405627326, "kl": 0.0013950904140074272, "learning_rate": 1.2266995941780933e-07, "loss": 1.4156103134155273e-06, "num_turns": 2.0, "reward": 0.5608351230621338, "reward_std": 0.2745205760002136, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5608351230621338, "rewards/MLPCodeOnPolicy32BORM/std": 0.2968989610671997, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3910.0, "completions/mean_length": 1240.84765625, "completions/min_length": 452.0, "epoch": 0.6768707482993197, "frac_reward_zero_std": 0.0, "grad_norm": 0.12443194247315312, "kl": 0.0012590040068971575, "learning_rate": 1.2035475905861134e-07, "loss": 1.3299286365509033e-06, "num_turns": 2.0, "reward": 0.5148465633392334, "reward_std": 0.35695213079452515, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5148465633392334, "rewards/MLPCodeOnPolicy32BORM/std": 0.38320231437683105, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4820.0, "completions/mean_length": 1296.47265625, "completions/min_length": 470.0, "epoch": 0.6802721088435374, "frac_reward_zero_std": 0.0, "grad_norm": 0.12668886855433553, "kl": 0.0012762437327182852, "learning_rate": 1.1805466875731276e-07, "loss": 1.3140961527824402e-06, "num_turns": 2.0, "reward": 0.43766123056411743, "reward_std": 0.41216611862182617, "rewards/MLPCodeOnPolicy32BORM/mean": 0.43766123056411743, "rewards/MLPCodeOnPolicy32BORM/std": 0.41272807121276855, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3375.0, "completions/mean_length": 1267.12890625, "completions/min_length": 451.0, "epoch": 0.6836734693877551, "frac_reward_zero_std": 0.0, "grad_norm": 0.12284357535625699, "kl": 0.0013010797565584653, "learning_rate": 1.1576995658775404e-07, "loss": 1.2200325727462769e-06, "num_turns": 2.0, "reward": 0.6425645351409912, "reward_std": 0.39451929926872253, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6425645351409912, "rewards/MLPCodeOnPolicy32BORM/std": 0.3956354558467865, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5039.0, "completions/mean_length": 1187.60546875, "completions/min_length": 477.0, "epoch": 0.6870748299319728, "frac_reward_zero_std": 0.0, "grad_norm": 0.1329605029871459, "kl": 0.0013695297720914823, "learning_rate": 1.1350088883146547e-07, "loss": 1.475214958190918e-06, "num_turns": 2.0, "reward": 0.5215319991111755, "reward_std": 0.3130980134010315, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5215319395065308, "rewards/MLPCodeOnPolicy32BORM/std": 0.34181302785873413, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3808.0, "completions/mean_length": 1268.234375, "completions/min_length": 530.0, "epoch": 0.6904761904761905, "frac_reward_zero_std": 0.0, "grad_norm": 0.12558617089585156, "kl": 0.001284107916035282, "learning_rate": 1.1124772994663256e-07, "loss": 1.2665987014770508e-06, "num_turns": 2.0, "reward": 0.48843446373939514, "reward_std": 0.3897768259048462, "rewards/MLPCodeOnPolicy32BORM/mean": 0.48843443393707275, "rewards/MLPCodeOnPolicy32BORM/std": 0.3982493281364441, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1252.0703125, "completions/min_length": 444.0, "epoch": 0.6938775510204082, "frac_reward_zero_std": 0.0, "grad_norm": 0.13137983301363407, "kl": 0.0013014281812502304, "learning_rate": 1.0901074253727336e-07, "loss": 1.2293457984924316e-06, "num_turns": 2.0, "reward": 0.511155366897583, "reward_std": 0.340930312871933, "rewards/MLPCodeOnPolicy32BORM/mean": 0.511155366897583, "rewards/MLPCodeOnPolicy32BORM/std": 0.3543807566165924, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6128.0, "completions/mean_length": 1238.28515625, "completions/min_length": 479.0, "epoch": 0.6972789115646258, "frac_reward_zero_std": 0.0, "grad_norm": 0.13356187086446547, "kl": 0.0013667331368196756, "learning_rate": 1.0679018732263257e-07, "loss": 1.3671815395355225e-06, "num_turns": 2.0, "reward": 0.5958702564239502, "reward_std": 0.39605003595352173, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5958702564239502, "rewards/MLPCodeOnPolicy32BORM/std": 0.4235108494758606, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3230.0, "completions/mean_length": 1246.95703125, "completions/min_length": 480.0, "epoch": 0.7006802721088435, "frac_reward_zero_std": 0.0, "grad_norm": 0.12808594455299088, "kl": 0.0013271556008476182, "learning_rate": 1.0458632310679438e-07, "loss": 1.4379620552062988e-06, "num_turns": 2.0, "reward": 0.6012597680091858, "reward_std": 0.3364196717739105, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6012598276138306, "rewards/MLPCodeOnPolicy32BORM/std": 0.3379861116409302, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3196.0, "completions/mean_length": 1140.3046875, "completions/min_length": 439.0, "epoch": 0.7040816326530612, "frac_reward_zero_std": 0.0, "grad_norm": 0.12792788035068012, "kl": 0.0013527601777241216, "learning_rate": 1.0239940674851941e-07, "loss": 1.3522803783416748e-06, "num_turns": 2.0, "reward": 0.5495452880859375, "reward_std": 0.3822019696235657, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5495453476905823, "rewards/MLPCodeOnPolicy32BORM/std": 0.3915736973285675, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 8193.0, "completions/mean_length": 1340.83984375, "completions/min_length": 539.0, "epoch": 0.7074829931972789, "frac_reward_zero_std": 0.0, "grad_norm": 0.12050380420709206, "kl": 0.0012090708578398335, "learning_rate": 1.0022969313130773e-07, "loss": 1.1548399925231934e-06, "num_turns": 2.0, "reward": 0.47699809074401855, "reward_std": 0.33950385451316833, "rewards/MLPCodeOnPolicy32BORM/mean": 0.47699812054634094, "rewards/MLPCodeOnPolicy32BORM/std": 0.3521897494792938, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5988.0, "completions/mean_length": 1326.73828125, "completions/min_length": 482.0, "epoch": 0.7108843537414966, "frac_reward_zero_std": 0.0, "grad_norm": 0.12634413168781206, "kl": 0.0012325601328484481, "learning_rate": 9.80774351336927e-08, "loss": 1.2740492820739746e-06, "num_turns": 2.0, "reward": 0.448714017868042, "reward_std": 0.43655937910079956, "rewards/MLPCodeOnPolicy32BORM/mean": 0.448714017868042, "rewards/MLPCodeOnPolicy32BORM/std": 0.42768001556396484, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2801.0, "completions/mean_length": 1206.16796875, "completions/min_length": 463.0, "epoch": 0.7142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.1306756891257428, "kl": 0.0013014997630307334, "learning_rate": 9.594288359976815e-08, "loss": 1.3671815395355225e-06, "num_turns": 2.0, "reward": 0.5778207778930664, "reward_std": 0.410552442073822, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5778207182884216, "rewards/MLPCodeOnPolicy32BORM/std": 0.42478153109550476, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4431.0, "completions/mean_length": 1307.96875, "completions/min_length": 466.0, "epoch": 0.717687074829932, "frac_reward_zero_std": 0.0, "grad_norm": 0.12608798208719338, "kl": 0.0012636054516406148, "learning_rate": 9.38262873099522e-08, "loss": 1.3262033462524414e-06, "num_turns": 2.0, "reward": 0.41280120611190796, "reward_std": 0.4218251705169678, "rewards/MLPCodeOnPolicy32BORM/mean": 0.41280120611190796, "rewards/MLPCodeOnPolicy32BORM/std": 0.43390703201293945, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3796.0, "completions/mean_length": 1275.0859375, "completions/min_length": 506.0, "epoch": 0.7210884353741497, "frac_reward_zero_std": 0.0, "grad_norm": 0.12194629809618202, "kl": 0.0012277502910365001, "learning_rate": 9.172789295199254e-08, "loss": 1.173466444015503e-06, "num_turns": 2.0, "reward": 0.5452022552490234, "reward_std": 0.42787063121795654, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5452022552490234, "rewards/MLPCodeOnPolicy32BORM/std": 0.4353907108306885, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3460.0, "completions/mean_length": 1236.53125, "completions/min_length": 446.0, "epoch": 0.7244897959183674, "frac_reward_zero_std": 0.0, "grad_norm": 0.14036928021373707, "kl": 0.0013081711795166484, "learning_rate": 8.964794509221507e-08, "loss": 1.3136304914951324e-06, "num_turns": 2.0, "reward": 0.6131947040557861, "reward_std": 0.39594751596450806, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6131947040557861, "rewards/MLPCodeOnPolicy32BORM/std": 0.4020529091358185, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5378.0, "completions/mean_length": 1286.6015625, "completions/min_length": 437.0, "epoch": 0.7278911564625851, "frac_reward_zero_std": 0.0, "grad_norm": 0.12787518883894636, "kl": 0.001297479861023021, "learning_rate": 8.758668614701972e-08, "loss": 1.2405216693878174e-06, "num_turns": 2.0, "reward": 0.5273051857948303, "reward_std": 0.42367199063301086, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5273051261901855, "rewards/MLPCodeOnPolicy32BORM/std": 0.4331037104129791, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3119.0, "completions/mean_length": 1257.484375, "completions/min_length": 455.0, "epoch": 0.7312925170068028, "frac_reward_zero_std": 0.0, "grad_norm": 0.11980010487610358, "kl": 0.0013178105773477, "learning_rate": 8.55443563546274e-08, "loss": 1.3113021850585938e-06, "num_turns": 2.0, "reward": 0.4529898166656494, "reward_std": 0.41719964146614075, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4529898166656494, "rewards/MLPCodeOnPolicy32BORM/std": 0.46845582127571106, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3151.0, "completions/mean_length": 1218.0078125, "completions/min_length": 430.0, "epoch": 0.7346938775510204, "frac_reward_zero_std": 0.0, "grad_norm": 0.1331665548979272, "kl": 0.0013294228419908904, "learning_rate": 8.352119374707977e-08, "loss": 1.2814998626708984e-06, "num_turns": 2.0, "reward": 0.6287795305252075, "reward_std": 0.2791963517665863, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6287795305252075, "rewards/MLPCodeOnPolicy32BORM/std": 0.3135279417037964, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3813.0, "completions/mean_length": 1320.07421875, "completions/min_length": 381.0, "epoch": 0.7380952380952381, "frac_reward_zero_std": 0.0, "grad_norm": 0.13325778517888945, "kl": 0.001306778119669616, "learning_rate": 8.151743412249728e-08, "loss": 1.3438984751701355e-06, "num_turns": 2.0, "reward": 0.5232036709785461, "reward_std": 0.3668539822101593, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5232036113739014, "rewards/MLPCodeOnPolicy32BORM/std": 0.4027640223503113, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4038.0, "completions/mean_length": 1262.80859375, "completions/min_length": 521.0, "epoch": 0.7414965986394558, "frac_reward_zero_std": 0.0, "grad_norm": 0.1226930510944803, "kl": 0.0013263678656585398, "learning_rate": 7.953331101759705e-08, "loss": 1.389533281326294e-06, "num_turns": 2.0, "reward": 0.6287027597427368, "reward_std": 0.34670159220695496, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6287027597427368, "rewards/MLPCodeOnPolicy32BORM/std": 0.36597710847854614, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4104.0, "completions/mean_length": 1232.12109375, "completions/min_length": 429.0, "epoch": 0.7448979591836735, "frac_reward_zero_std": 0.0, "grad_norm": 0.12901763117209306, "kl": 0.0013840211349815945, "learning_rate": 7.756905568047392e-08, "loss": 1.3485550880432129e-06, "num_turns": 2.0, "reward": 0.5536688566207886, "reward_std": 0.3205341696739197, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5536688566207886, "rewards/MLPCodeOnPolicy32BORM/std": 0.3282807171344757, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2886.0, "completions/mean_length": 1263.96875, "completions/min_length": 482.0, "epoch": 0.7482993197278912, "frac_reward_zero_std": 0.0, "grad_norm": 0.12890254868272635, "kl": 0.0014110473175605875, "learning_rate": 7.56248970436493e-08, "loss": 1.4100223779678345e-06, "num_turns": 2.0, "reward": 0.6479886770248413, "reward_std": 0.35511314868927, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6479886770248413, "rewards/MLPCodeOnPolicy32BORM/std": 0.3817684054374695, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5501.0, "completions/mean_length": 1255.84765625, "completions/min_length": 455.0, "epoch": 0.7517006802721088, "frac_reward_zero_std": 0.0, "grad_norm": 0.12610719119192687, "kl": 0.0014363232407959003, "learning_rate": 7.37010616973886e-08, "loss": 1.2908130884170532e-06, "num_turns": 2.0, "reward": 0.5539280772209167, "reward_std": 0.4543180465698242, "rewards/MLPCodeOnPolicy32BORM/mean": 0.553928017616272, "rewards/MLPCodeOnPolicy32BORM/std": 0.4694295823574066, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3728.0, "completions/mean_length": 1300.24609375, "completions/min_length": 513.0, "epoch": 0.7551020408163265, "frac_reward_zero_std": 0.0, "grad_norm": 0.12843810609713965, "kl": 0.0012828246744902572, "learning_rate": 7.179777386329275e-08, "loss": 1.280568540096283e-06, "num_turns": 2.0, "reward": 0.5100784301757812, "reward_std": 0.40308377146720886, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5100784301757812, "rewards/MLPCodeOnPolicy32BORM/std": 0.4186839163303375, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3139.0, "completions/mean_length": 1211.234375, "completions/min_length": 432.0, "epoch": 0.7585034013605442, "frac_reward_zero_std": 0.0, "grad_norm": 0.13185135148687682, "kl": 0.001326795773820777, "learning_rate": 6.991525536816497e-08, "loss": 1.34296715259552e-06, "num_turns": 2.0, "reward": 0.46107718348503113, "reward_std": 0.3825857639312744, "rewards/MLPCodeOnPolicy32BORM/mean": 0.46107718348503113, "rewards/MLPCodeOnPolicy32BORM/std": 0.3778640627861023, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3264.0, "completions/mean_length": 1196.265625, "completions/min_length": 503.0, "epoch": 0.7619047619047619, "frac_reward_zero_std": 0.0, "grad_norm": 0.13298086952113006, "kl": 0.0014002320585859707, "learning_rate": 6.805372561815767e-08, "loss": 1.369975507259369e-06, "num_turns": 2.0, "reward": 0.5463467240333557, "reward_std": 0.4121183753013611, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5463467240333557, "rewards/MLPCodeOnPolicy32BORM/std": 0.42288950085639954, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1262.1328125, "completions/min_length": 456.0, "epoch": 0.7653061224489796, "frac_reward_zero_std": 0.0, "grad_norm": 0.1255035449090627, "kl": 0.0013547614798881114, "learning_rate": 6.621340157319996e-08, "loss": 1.2889504432678223e-06, "num_turns": 2.0, "reward": 0.5293986797332764, "reward_std": 0.3732859492301941, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5293987393379211, "rewards/MLPCodeOnPolicy32BORM/std": 0.40485361218452454, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3308.0, "completions/mean_length": 1212.421875, "completions/min_length": 462.0, "epoch": 0.7687074829931972, "frac_reward_zero_std": 0.0, "grad_norm": 0.12740333260958453, "kl": 0.001389774693052459, "learning_rate": 6.439449772171162e-08, "loss": 1.3522803783416748e-06, "num_turns": 2.0, "reward": 0.5247737169265747, "reward_std": 0.31403160095214844, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5247736573219299, "rewards/MLPCodeOnPolicy32BORM/std": 0.3341880440711975, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4311.0, "completions/mean_length": 1256.6953125, "completions/min_length": 519.0, "epoch": 0.7721088435374149, "frac_reward_zero_std": 0.0, "grad_norm": 0.1298841142351844, "kl": 0.0013443967050079664, "learning_rate": 6.259722605560488e-08, "loss": 1.4165416359901428e-06, "num_turns": 2.0, "reward": 0.515049934387207, "reward_std": 0.43085336685180664, "rewards/MLPCodeOnPolicy32BORM/mean": 0.515049934387207, "rewards/MLPCodeOnPolicy32BORM/std": 0.4393814206123352, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4830.0, "completions/mean_length": 1255.2890625, "completions/min_length": 448.0, "epoch": 0.7755102040816326, "frac_reward_zero_std": 0.0, "grad_norm": 0.12858098027696746, "kl": 0.0012866992628914886, "learning_rate": 6.082179604557616e-08, "loss": 1.3224780559539795e-06, "num_turns": 2.0, "reward": 0.5552853345870972, "reward_std": 0.34694957733154297, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5552853345870972, "rewards/MLPCodeOnPolicy32BORM/std": 0.36402466893196106, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3143.0, "completions/mean_length": 1254.99609375, "completions/min_length": 450.0, "epoch": 0.7789115646258503, "frac_reward_zero_std": 0.0, "grad_norm": 0.12893806348706394, "kl": 0.0012813517714675982, "learning_rate": 5.9068414616693266e-08, "loss": 1.2330710887908936e-06, "num_turns": 2.0, "reward": 0.46818554401397705, "reward_std": 0.3551763892173767, "rewards/MLPCodeOnPolicy32BORM/mean": 0.46818554401397705, "rewards/MLPCodeOnPolicy32BORM/std": 0.36898866295814514, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3138.0, "completions/mean_length": 1278.76171875, "completions/min_length": 507.0, "epoch": 0.782312925170068, "frac_reward_zero_std": 0.0, "grad_norm": 0.12421445570047403, "kl": 0.0013655927568834159, "learning_rate": 5.733728612427771e-08, "loss": 1.5338882803916931e-06, "num_turns": 2.0, "reward": 0.6461366415023804, "reward_std": 0.33448177576065063, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6461366415023804, "rewards/MLPCodeOnPolicy32BORM/std": 0.3466874659061432, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1294.75, "completions/min_length": 484.0, "epoch": 0.7857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.12127892715668272, "kl": 0.0013997323094372405, "learning_rate": 5.5628612330087724e-08, "loss": 1.4491379261016846e-06, "num_turns": 2.0, "reward": 0.5366698503494263, "reward_std": 0.410519540309906, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5366698503494263, "rewards/MLPCodeOnPolicy32BORM/std": 0.42046892642974854, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3791.0, "completions/mean_length": 1275.3515625, "completions/min_length": 488.0, "epoch": 0.7891156462585034, "frac_reward_zero_std": 0.0, "grad_norm": 0.12653282121251824, "kl": 0.001317089796430082, "learning_rate": 5.394259237880272e-08, "loss": 1.3327226042747498e-06, "num_turns": 2.0, "reward": 0.5845522880554199, "reward_std": 0.31545960903167725, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5845522284507751, "rewards/MLPCodeOnPolicy32BORM/std": 0.3370753824710846, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5526.0, "completions/mean_length": 1302.609375, "completions/min_length": 542.0, "epoch": 0.7925170068027211, "frac_reward_zero_std": 0.0, "grad_norm": 0.13048198986020212, "kl": 0.0012734234924209886, "learning_rate": 5.227942277481362e-08, "loss": 1.296401023864746e-06, "num_turns": 2.0, "reward": 0.41471242904663086, "reward_std": 0.4190084636211395, "rewards/MLPCodeOnPolicy32BORM/mean": 0.41471242904663086, "rewards/MLPCodeOnPolicy32BORM/std": 0.42466604709625244, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3382.0, "completions/mean_length": 1286.8125, "completions/min_length": 548.0, "epoch": 0.7959183673469388, "frac_reward_zero_std": 0.0, "grad_norm": 0.1275063488377491, "kl": 0.0013080939488645527, "learning_rate": 5.0639297359319846e-08, "loss": 1.3969838619232178e-06, "num_turns": 2.0, "reward": 0.5419309139251709, "reward_std": 0.4084588289260864, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5419309139251709, "rewards/MLPCodeOnPolicy32BORM/std": 0.42553380131721497, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3599.0, "completions/mean_length": 1292.29296875, "completions/min_length": 517.0, "epoch": 0.7993197278911565, "frac_reward_zero_std": 0.0, "grad_norm": 0.12339984082764446, "kl": 0.0012895453537566937, "learning_rate": 4.902240728773749e-08, "loss": 1.2051314115524292e-06, "num_turns": 2.0, "reward": 0.5682984590530396, "reward_std": 0.2678278088569641, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5682984590530396, "rewards/MLPCodeOnPolicy32BORM/std": 0.29323381185531616, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3072.0, "completions/mean_length": 1364.32421875, "completions/min_length": 507.0, "epoch": 0.8027210884353742, "frac_reward_zero_std": 0.0, "grad_norm": 0.12114741624083838, "kl": 0.00119225540493062, "learning_rate": 4.742894100742062e-08, "loss": 1.1418014764785767e-06, "num_turns": 2.0, "reward": 0.41127753257751465, "reward_std": 0.40841516852378845, "rewards/MLPCodeOnPolicy32BORM/mean": 0.41127756237983704, "rewards/MLPCodeOnPolicy32BORM/std": 0.4287666976451874, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3509.0, "completions/mean_length": 1307.91015625, "completions/min_length": 449.0, "epoch": 0.8061224489795918, "frac_reward_zero_std": 0.0, "grad_norm": 0.1332653805549814, "kl": 0.0013080557828288875, "learning_rate": 4.5859084235697235e-08, "loss": 1.3187527656555176e-06, "num_turns": 2.0, "reward": 0.48149698972702026, "reward_std": 0.43754392862319946, "rewards/MLPCodeOnPolicy32BORM/mean": 0.48149701952934265, "rewards/MLPCodeOnPolicy32BORM/std": 0.44631412625312805, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5070.0, "completions/mean_length": 1316.9921875, "completions/min_length": 465.0, "epoch": 0.8095238095238095, "frac_reward_zero_std": 0.0, "grad_norm": 0.12326346970337854, "kl": 0.0012852595964432112, "learning_rate": 4.43130199382247e-08, "loss": 1.4621764421463013e-06, "num_turns": 2.0, "reward": 0.550597071647644, "reward_std": 0.4328611493110657, "rewards/MLPCodeOnPolicy32BORM/mean": 0.550597071647644, "rewards/MLPCodeOnPolicy32BORM/std": 0.4346959590911865, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6591.0, "completions/mean_length": 1328.93359375, "completions/min_length": 456.0, "epoch": 0.8129251700680272, "frac_reward_zero_std": 0.0, "grad_norm": 0.12092938068649253, "kl": 0.0012998831398363109, "learning_rate": 4.2790928307664706e-08, "loss": 1.2703239917755127e-06, "num_turns": 2.0, "reward": 0.4453105926513672, "reward_std": 0.48312580585479736, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4453105926513672, "rewards/MLPCodeOnPolicy32BORM/std": 0.4780498147010803, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3332.0, "completions/mean_length": 1213.22265625, "completions/min_length": 509.0, "epoch": 0.8163265306122449, "frac_reward_zero_std": 0.0, "grad_norm": 0.1316405251458809, "kl": 0.0013398944165601279, "learning_rate": 4.1292986742682254e-08, "loss": 1.3709068298339844e-06, "num_turns": 2.0, "reward": 0.6118814945220947, "reward_std": 0.30538809299468994, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6118814945220947, "rewards/MLPCodeOnPolicy32BORM/std": 0.34404152631759644, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2994.0, "completions/mean_length": 1192.1640625, "completions/min_length": 445.0, "epoch": 0.8197278911564626, "frac_reward_zero_std": 0.0, "grad_norm": 0.12208372574845405, "kl": 0.0014582149669877253, "learning_rate": 3.98193698272698e-08, "loss": 1.3541430234909058e-06, "num_turns": 2.0, "reward": 0.5893775820732117, "reward_std": 0.36501747369766235, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5893775820732117, "rewards/MLPCodeOnPolicy32BORM/std": 0.3986145853996277, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4604.0, "completions/mean_length": 1244.73828125, "completions/min_length": 346.0, "epoch": 0.8231292517006803, "frac_reward_zero_std": 0.0, "grad_norm": 0.1272060823706791, "kl": 0.0013787021653115517, "learning_rate": 3.837024931039995e-08, "loss": 1.4100223779678345e-06, "num_turns": 2.0, "reward": 0.5549571514129639, "reward_std": 0.40352365374565125, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5549571514129639, "rewards/MLPCodeOnPolicy32BORM/std": 0.407723993062973, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4099.0, "completions/mean_length": 1194.7421875, "completions/min_length": 467.0, "epoch": 0.826530612244898, "frac_reward_zero_std": 0.0, "grad_norm": 0.13157085650901407, "kl": 0.0014077305258979322, "learning_rate": 3.6945794086007705e-08, "loss": 1.2433156371116638e-06, "num_turns": 2.0, "reward": 0.6976893544197083, "reward_std": 0.2876189947128296, "rewards/MLPCodeOnPolicy32BORM/mean": 0.697689414024353, "rewards/MLPCodeOnPolicy32BORM/std": 0.33244314789772034, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4483.0, "completions/mean_length": 1337.625, "completions/min_length": 521.0, "epoch": 0.8299319727891157, "frac_reward_zero_std": 0.0, "grad_norm": 0.1207531264113648, "kl": 0.0013017231822232134, "learning_rate": 3.5546170173306436e-08, "loss": 1.3485550880432129e-06, "num_turns": 2.0, "reward": 0.5120729207992554, "reward_std": 0.3624032437801361, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5120729207992554, "rewards/MLPCodeOnPolicy32BORM/std": 0.3639506995677948, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3035.0, "completions/mean_length": 1198.40625, "completions/min_length": 474.0, "epoch": 0.8333333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 0.12806495998767994, "kl": 0.0014157043924569734, "learning_rate": 3.4171540697438355e-08, "loss": 1.3671815395355225e-06, "num_turns": 2.0, "reward": 0.5975139737129211, "reward_std": 0.37816518545150757, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5975139737129211, "rewards/MLPCodeOnPolicy32BORM/std": 0.39434632658958435, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1249.2265625, "completions/min_length": 507.0, "epoch": 0.8367346938775511, "frac_reward_zero_std": 0.0, "grad_norm": 0.12522900942275697, "kl": 0.0013556128133132006, "learning_rate": 3.2822065870462215e-08, "loss": 1.4379620552062988e-06, "num_turns": 2.0, "reward": 0.4996965825557709, "reward_std": 0.4283973276615143, "rewards/MLPCodeOnPolicy32BORM/mean": 0.49969661235809326, "rewards/MLPCodeOnPolicy32BORM/std": 0.44312095642089844, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3184.0, "completions/mean_length": 1268.6171875, "completions/min_length": 536.0, "epoch": 0.8401360544217688, "frac_reward_zero_std": 0.0, "grad_norm": 0.12919194681083224, "kl": 0.0012587565233843634, "learning_rate": 3.149790297268107e-08, "loss": 1.3522803783416748e-06, "num_turns": 2.0, "reward": 0.5647189021110535, "reward_std": 0.40422070026397705, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5647189021110535, "rewards/MLPCodeOnPolicy32BORM/std": 0.42468276619911194, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2953.0, "completions/mean_length": 1125.83203125, "completions/min_length": 508.0, "epoch": 0.8435374149659864, "frac_reward_zero_std": 0.0, "grad_norm": 0.12836800954696548, "kl": 0.0014017259263710002, "learning_rate": 3.0199206334310945e-08, "loss": 1.298263669013977e-06, "num_turns": 2.0, "reward": 0.6107585430145264, "reward_std": 0.3228145241737366, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6107584834098816, "rewards/MLPCodeOnPolicy32BORM/std": 0.3467032015323639, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4479.0, "completions/mean_length": 1233.171875, "completions/min_length": 384.0, "epoch": 0.8469387755102041, "frac_reward_zero_std": 0.0, "grad_norm": 0.12577691381845763, "kl": 0.0013851264275217545, "learning_rate": 2.892612731749414e-08, "loss": 1.2442469596862793e-06, "num_turns": 2.0, "reward": 0.5270353555679321, "reward_std": 0.40185391902923584, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5270353555679321, "rewards/MLPCodeOnPolicy32BORM/std": 0.4054703414440155, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2801.0, "completions/mean_length": 1164.140625, "completions/min_length": 448.0, "epoch": 0.8503401360544217, "frac_reward_zero_std": 0.0, "grad_norm": 0.13561042447797822, "kl": 0.0014222385561879491, "learning_rate": 2.7678814298657732e-08, "loss": 1.475214958190918e-06, "num_turns": 2.0, "reward": 0.5727814435958862, "reward_std": 0.2735365629196167, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5727814435958862, "rewards/MLPCodeOnPolicy32BORM/std": 0.3202778697013855, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4152.0, "completions/mean_length": 1262.21875, "completions/min_length": 469.0, "epoch": 0.8537414965986394, "frac_reward_zero_std": 0.0, "grad_norm": 0.1271199876604987, "kl": 0.0013093487041260232, "learning_rate": 2.6457412651220895e-08, "loss": 1.4081597328186035e-06, "num_turns": 2.0, "reward": 0.5430048704147339, "reward_std": 0.38867905735969543, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5430048704147339, "rewards/MLPCodeOnPolicy32BORM/std": 0.41183096170425415, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4171.0, "completions/mean_length": 1299.67578125, "completions/min_length": 449.0, "epoch": 0.8571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.1304441357703922, "kl": 0.0013141680137778167, "learning_rate": 2.5262064728651194e-08, "loss": 1.3150274753570557e-06, "num_turns": 2.0, "reward": 0.5142631530761719, "reward_std": 0.3517936170101166, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5142631530761719, "rewards/MLPCodeOnPolicy32BORM/std": 0.36099955439567566, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3958.0, "completions/mean_length": 1306.7109375, "completions/min_length": 468.0, "epoch": 0.8605442176870748, "frac_reward_zero_std": 0.0, "grad_norm": 0.1316901048192095, "kl": 0.0013079443942842772, "learning_rate": 2.409290984787371e-08, "loss": 1.3560056686401367e-06, "num_turns": 2.0, "reward": 0.5310392379760742, "reward_std": 0.3759360909461975, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5310392379760742, "rewards/MLPCodeOnPolicy32BORM/std": 0.3867994248867035, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2512.0, "completions/mean_length": 1135.703125, "completions/min_length": 434.0, "epoch": 0.8639455782312925, "frac_reward_zero_std": 0.0, "grad_norm": 0.1231556725021594, "kl": 0.001387271031489945, "learning_rate": 2.2950084273033633e-08, "loss": 1.4193356037139893e-06, "num_turns": 2.0, "reward": 0.6066977977752686, "reward_std": 0.2876471281051636, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6066977977752686, "rewards/MLPCodeOnPolicy32BORM/std": 0.31261515617370605, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3405.0, "completions/mean_length": 1179.4453125, "completions/min_length": 461.0, "epoch": 0.8673469387755102, "frac_reward_zero_std": 0.0, "grad_norm": 0.13542018475020984, "kl": 0.0013846461633875151, "learning_rate": 2.183372119961499e-08, "loss": 1.3578683137893677e-06, "num_turns": 2.0, "reward": 0.4826492369174957, "reward_std": 0.37893763184547424, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4826492369174957, "rewards/MLPCodeOnPolicy32BORM/std": 0.3863247334957123, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3682.0, "completions/mean_length": 1303.90625, "completions/min_length": 515.0, "epoch": 0.8707482993197279, "frac_reward_zero_std": 0.0, "grad_norm": 0.12642203725075352, "kl": 0.0013932072433817666, "learning_rate": 2.074395073891644e-08, "loss": 1.3336539268493652e-06, "num_turns": 2.0, "reward": 0.5473675727844238, "reward_std": 0.3467334508895874, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5473675727844238, "rewards/MLPCodeOnPolicy32BORM/std": 0.3749827444553375, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3564.0, "completions/mean_length": 1240.109375, "completions/min_length": 477.0, "epoch": 0.8741496598639455, "frac_reward_zero_std": 0.0, "grad_norm": 0.1325814674676704, "kl": 0.0013994931805427768, "learning_rate": 1.9680899902887266e-08, "loss": 1.4491379261016846e-06, "num_turns": 2.0, "reward": 0.6287966966629028, "reward_std": 0.2795559763908386, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6287966966629028, "rewards/MLPCodeOnPolicy32BORM/std": 0.3080456852912903, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4612.0, "completions/mean_length": 1310.80078125, "completions/min_length": 479.0, "epoch": 0.8775510204081632, "frac_reward_zero_std": 0.0, "grad_norm": 0.1273096835305895, "kl": 0.0012589961306730402, "learning_rate": 1.8644692589323967e-08, "loss": 1.2870877981185913e-06, "num_turns": 2.0, "reward": 0.4979984760284424, "reward_std": 0.4460023045539856, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4979984760284424, "rewards/MLPCodeOnPolicy32BORM/std": 0.4358255863189697, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3290.0, "completions/mean_length": 1306.72265625, "completions/min_length": 463.0, "epoch": 0.8809523809523809, "frac_reward_zero_std": 0.0, "grad_norm": 0.12743739498262194, "kl": 0.0013438629566735472, "learning_rate": 1.7635449567430183e-08, "loss": 1.2833625078201294e-06, "num_turns": 2.0, "reward": 0.5485579967498779, "reward_std": 0.3817247748374939, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5485579967498779, "rewards/MLPCodeOnPolicy32BORM/std": 0.3858395516872406, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3359.0, "completions/mean_length": 1184.99609375, "completions/min_length": 479.0, "epoch": 0.8843537414965986, "frac_reward_zero_std": 0.0, "grad_norm": 0.1264194885841443, "kl": 0.0014367117000801954, "learning_rate": 1.6653288463741062e-08, "loss": 1.5553086996078491e-06, "num_turns": 2.0, "reward": 0.5498301982879639, "reward_std": 0.376219242811203, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5498301982879639, "rewards/MLPCodeOnPolicy32BORM/std": 0.39297208189964294, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3728.0, "completions/mean_length": 1216.14453125, "completions/min_length": 491.0, "epoch": 0.8877551020408163, "frac_reward_zero_std": 0.0, "grad_norm": 0.12428839600576591, "kl": 0.0013569434822784388, "learning_rate": 1.5698323748414122e-08, "loss": 1.3709068298339844e-06, "num_turns": 2.0, "reward": 0.6100364923477173, "reward_std": 0.3211628198623657, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6100364923477173, "rewards/MLPCodeOnPolicy32BORM/std": 0.3399140238761902, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2995.0, "completions/mean_length": 1211.3984375, "completions/min_length": 445.0, "epoch": 0.891156462585034, "frac_reward_zero_std": 0.0, "grad_norm": 0.13142807112400828, "kl": 0.001364831018690893, "learning_rate": 1.4770666721887621e-08, "loss": 1.3671815395355225e-06, "num_turns": 2.0, "reward": 0.431456983089447, "reward_std": 0.4261000454425812, "rewards/MLPCodeOnPolicy32BORM/mean": 0.431456983089447, "rewards/MLPCodeOnPolicy32BORM/std": 0.42020177841186523, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3240.0, "completions/mean_length": 1270.296875, "completions/min_length": 494.0, "epoch": 0.8945578231292517, "frac_reward_zero_std": 0.0, "grad_norm": 0.12432611658097836, "kl": 0.0013498828893716563, "learning_rate": 1.3870425501908672e-08, "loss": 1.3872049748897552e-06, "num_turns": 2.0, "reward": 0.3706500828266144, "reward_std": 0.4424039423465729, "rewards/MLPCodeOnPolicy32BORM/mean": 0.37065011262893677, "rewards/MLPCodeOnPolicy32BORM/std": 0.4383034110069275, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3257.0, "completions/mean_length": 1216.11328125, "completions/min_length": 497.0, "epoch": 0.8979591836734694, "frac_reward_zero_std": 0.0, "grad_norm": 0.12923448726401368, "kl": 0.001308337137743365, "learning_rate": 1.2997705010932391e-08, "loss": 1.3634562492370605e-06, "num_turns": 2.0, "reward": 0.583458662033081, "reward_std": 0.4207555055618286, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5834586024284363, "rewards/MLPCodeOnPolicy32BORM/std": 0.4383680522441864, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3966.0, "completions/mean_length": 1267.04296875, "completions/min_length": 462.0, "epoch": 0.9013605442176871, "frac_reward_zero_std": 0.0, "grad_norm": 0.12273271242830487, "kl": 0.0013095537397020962, "learning_rate": 1.2152606963892863e-08, "loss": 1.298263669013977e-06, "num_turns": 2.0, "reward": 0.5434870719909668, "reward_std": 0.3962419033050537, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5434870719909668, "rewards/MLPCodeOnPolicy32BORM/std": 0.40210050344467163, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2793.0, "completions/mean_length": 1221.6484375, "completions/min_length": 498.0, "epoch": 0.9047619047619048, "frac_reward_zero_std": 0.0, "grad_norm": 0.1373786488327759, "kl": 0.0013954197893326636, "learning_rate": 1.1335229856348689e-08, "loss": 1.4295801520347595e-06, "num_turns": 2.0, "reward": 0.4984479248523712, "reward_std": 0.336830735206604, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4984479546546936, "rewards/MLPCodeOnPolicy32BORM/std": 0.377440869808197, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4732.0, "completions/mean_length": 1256.984375, "completions/min_length": 511.0, "epoch": 0.9081632653061225, "frac_reward_zero_std": 0.0, "grad_norm": 0.13037104352607035, "kl": 0.0012341619449216523, "learning_rate": 1.054566895300324e-08, "loss": 1.2405216693878174e-06, "num_turns": 2.0, "reward": 0.33690741658210754, "reward_std": 0.4263767600059509, "rewards/MLPCodeOnPolicy32BORM/mean": 0.33690741658210754, "rewards/MLPCodeOnPolicy32BORM/std": 0.43096432089805603, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2919.0, "completions/mean_length": 1166.61328125, "completions/min_length": 383.0, "epoch": 0.9115646258503401, "frac_reward_zero_std": 0.0, "grad_norm": 0.13027755451241033, "kl": 0.0013724857608394814, "learning_rate": 9.784016276601609e-09, "loss": 1.3690441846847534e-06, "num_turns": 2.0, "reward": 0.5407834053039551, "reward_std": 0.3101251721382141, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5407834053039551, "rewards/MLPCodeOnPolicy32BORM/std": 0.34196120500564575, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2649.0, "completions/mean_length": 1170.43359375, "completions/min_length": 470.0, "epoch": 0.9149659863945578, "frac_reward_zero_std": 0.0, "grad_norm": 0.12799678385136565, "kl": 0.0014380482898559421, "learning_rate": 9.050360597205513e-09, "loss": 1.475214958190918e-06, "num_turns": 2.0, "reward": 0.6152033805847168, "reward_std": 0.3536611795425415, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6152033805847168, "rewards/MLPCodeOnPolicy32BORM/std": 0.3689250349998474, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4481.0, "completions/mean_length": 1223.59375, "completions/min_length": 460.0, "epoch": 0.9183673469387755, "frac_reward_zero_std": 0.0, "grad_norm": 0.1286436267395403, "kl": 0.0014034728528713458, "learning_rate": 8.344787421847216e-09, "loss": 1.4491379261016846e-06, "num_turns": 2.0, "reward": 0.5233062505722046, "reward_std": 0.37922120094299316, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5233062505722046, "rewards/MLPCodeOnPolicy32BORM/std": 0.4167995750904083, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3936.0, "completions/mean_length": 1185.27734375, "completions/min_length": 409.0, "epoch": 0.9217687074829932, "frac_reward_zero_std": 0.0, "grad_norm": 0.13565037787124126, "kl": 0.0014686701588288997, "learning_rate": 7.667378984563599e-09, "loss": 1.4975666999816895e-06, "num_turns": 2.0, "reward": 0.6511124968528748, "reward_std": 0.3759189546108246, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6511124968528748, "rewards/MLPCodeOnPolicy32BORM/std": 0.37731456756591797, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3347.0, "completions/mean_length": 1224.71875, "completions/min_length": 511.0, "epoch": 0.9251700680272109, "frac_reward_zero_std": 0.0, "grad_norm": 0.13122739068113443, "kl": 0.0014554602657881333, "learning_rate": 7.018214236812009e-09, "loss": 1.4603137969970703e-06, "num_turns": 2.0, "reward": 0.6520252823829651, "reward_std": 0.3065425753593445, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6520252823829651, "rewards/MLPCodeOnPolicy32BORM/std": 0.336844801902771, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 8193.0, "completions/mean_length": 1329.84765625, "completions/min_length": 408.0, "epoch": 0.9285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.12969250322344092, "kl": 0.0013099484158374253, "learning_rate": 6.397368838268496e-09, "loss": 1.3224780559539795e-06, "num_turns": 2.0, "reward": 0.4384145736694336, "reward_std": 0.4246937334537506, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4384145736694336, "rewards/MLPCodeOnPolicy32BORM/std": 0.43505164980888367, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3074.0, "completions/mean_length": 1172.4453125, "completions/min_length": 414.0, "epoch": 0.9319727891156463, "frac_reward_zero_std": 0.0, "grad_norm": 0.12746256958452673, "kl": 0.0015114195266505703, "learning_rate": 5.80491514800957e-09, "loss": 1.543201506137848e-06, "num_turns": 2.0, "reward": 0.5098080635070801, "reward_std": 0.4241393804550171, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5098081827163696, "rewards/MLPCodeOnPolicy32BORM/std": 0.43560436367988586, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3518.0, "completions/mean_length": 1221.8046875, "completions/min_length": 476.0, "epoch": 0.935374149659864, "frac_reward_zero_std": 0.0, "grad_norm": 0.12529544378366023, "kl": 0.001437538394384319, "learning_rate": 5.24092221607908e-09, "loss": 1.4826655387878418e-06, "num_turns": 2.0, "reward": 0.4893878102302551, "reward_std": 0.40568646788597107, "rewards/MLPCodeOnPolicy32BORM/mean": 0.4893878400325775, "rewards/MLPCodeOnPolicy32BORM/std": 0.4118155241012573, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6452.0, "completions/mean_length": 1276.9453125, "completions/min_length": 459.0, "epoch": 0.9387755102040817, "frac_reward_zero_std": 0.0, "grad_norm": 0.13384851249165047, "kl": 0.0013293683077790774, "learning_rate": 4.705455775440237e-09, "loss": 1.1986121535301208e-06, "num_turns": 2.0, "reward": 0.47771668434143066, "reward_std": 0.40792709589004517, "rewards/MLPCodeOnPolicy32BORM/mean": 0.47771668434143066, "rewards/MLPCodeOnPolicy32BORM/std": 0.42754605412483215, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3401.0, "completions/mean_length": 1260.4296875, "completions/min_length": 402.0, "epoch": 0.9421768707482994, "frac_reward_zero_std": 0.0, "grad_norm": 0.12174752100098382, "kl": 0.0013493520209522103, "learning_rate": 4.198578234314604e-09, "loss": 1.2908130884170532e-06, "num_turns": 2.0, "reward": 0.5614825487136841, "reward_std": 0.33593881130218506, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5614825487136841, "rewards/MLPCodeOnPolicy32BORM/std": 0.3463003933429718, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4036.0, "completions/mean_length": 1229.71875, "completions/min_length": 502.0, "epoch": 0.9455782312925171, "frac_reward_zero_std": 0.0, "grad_norm": 0.12422464239414316, "kl": 0.0013207711699578795, "learning_rate": 3.720348668908385e-09, "loss": 1.4547258615493774e-06, "num_turns": 2.0, "reward": 0.6281092762947083, "reward_std": 0.35857096314430237, "rewards/MLPCodeOnPolicy32BORM/mean": 0.628109335899353, "rewards/MLPCodeOnPolicy32BORM/std": 0.35941147804260254, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2924.0, "completions/mean_length": 1186.828125, "completions/min_length": 534.0, "epoch": 0.9489795918367347, "frac_reward_zero_std": 0.0, "grad_norm": 0.12632069336817253, "kl": 0.0014034768064448144, "learning_rate": 3.2708228165273244e-09, "loss": 1.2293457984924316e-06, "num_turns": 2.0, "reward": 0.5960803031921387, "reward_std": 0.29391491413116455, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5960803031921387, "rewards/MLPCodeOnPolicy32BORM/std": 0.3531346917152405, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8193.0, "completions/mean_length": 1162.16015625, "completions/min_length": 452.0, "epoch": 0.9523809523809523, "frac_reward_zero_std": 0.0, "grad_norm": 0.12842830063820992, "kl": 0.001470525316108251, "learning_rate": 2.850053069080344e-09, "loss": 1.4062970876693726e-06, "num_turns": 2.0, "reward": 0.5352786779403687, "reward_std": 0.3336600661277771, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5352786779403687, "rewards/MLPCodeOnPolicy32BORM/std": 0.3443100154399872, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3791.0, "completions/mean_length": 1248.890625, "completions/min_length": 546.0, "epoch": 0.95578231292517, "frac_reward_zero_std": 0.0, "grad_norm": 0.12484503942889262, "kl": 0.0013901375514251413, "learning_rate": 2.458088466973346e-09, "loss": 1.3709068298339844e-06, "num_turns": 2.0, "reward": 0.5107653141021729, "reward_std": 0.38533347845077515, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5107653141021729, "rewards/MLPCodeOnPolicy32BORM/std": 0.3988993465900421, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3588.0, "completions/mean_length": 1303.5703125, "completions/min_length": 506.0, "epoch": 0.9591836734693877, "frac_reward_zero_std": 0.0, "grad_norm": 0.12427816786444691, "kl": 0.0013491953013726743, "learning_rate": 2.094974693393731e-09, "loss": 1.3280659914016724e-06, "num_turns": 2.0, "reward": 0.5057296752929688, "reward_std": 0.35366684198379517, "rewards/MLPCodeOnPolicy32BORM/mean": 0.505729615688324, "rewards/MLPCodeOnPolicy32BORM/std": 0.3649117350578308, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3250.0, "completions/mean_length": 1188.65625, "completions/min_length": 475.0, "epoch": 0.9625850340136054, "frac_reward_zero_std": 0.0, "grad_norm": 0.12292104991319937, "kl": 0.0013688728076886036, "learning_rate": 1.7607540689859035e-09, "loss": 1.3783574104309082e-06, "num_turns": 2.0, "reward": 0.47922593355178833, "reward_std": 0.37873101234436035, "rewards/MLPCodeOnPolicy32BORM/mean": 0.47922593355178833, "rewards/MLPCodeOnPolicy32BORM/std": 0.39812812209129333, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2948.0, "completions/mean_length": 1189.20703125, "completions/min_length": 475.0, "epoch": 0.9659863945578231, "frac_reward_zero_std": 0.0, "grad_norm": 0.12510495565401736, "kl": 0.0013413819378911285, "learning_rate": 1.4554655469189437e-09, "loss": 1.3094395399093628e-06, "num_turns": 2.0, "reward": 0.37418410181999207, "reward_std": 0.4322727918624878, "rewards/MLPCodeOnPolicy32BORM/mean": 0.37418413162231445, "rewards/MLPCodeOnPolicy32BORM/std": 0.42694520950317383, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3314.0, "completions/mean_length": 1233.1796875, "completions/min_length": 434.0, "epoch": 0.9693877551020408, "frac_reward_zero_std": 0.0, "grad_norm": 0.12785761769933177, "kl": 0.0013754392584814923, "learning_rate": 1.1791447083465133e-09, "loss": 1.3709068298339844e-06, "num_turns": 2.0, "reward": 0.5906023383140564, "reward_std": 0.27952495217323303, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5906022787094116, "rewards/MLPCodeOnPolicy32BORM/std": 0.29190781712532043, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2930.0, "completions/mean_length": 1205.79296875, "completions/min_length": 426.0, "epoch": 0.9727891156462585, "frac_reward_zero_std": 0.0, "grad_norm": 0.1261298576091256, "kl": 0.001388487350595824, "learning_rate": 9.318237582600086e-10, "loss": 1.4435499906539917e-06, "num_turns": 2.0, "reward": 0.6461822986602783, "reward_std": 0.32930630445480347, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6461822986602783, "rewards/MLPCodeOnPolicy32BORM/std": 0.3550751507282257, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3610.0, "completions/mean_length": 1332.69140625, "completions/min_length": 566.0, "epoch": 0.9761904761904762, "frac_reward_zero_std": 0.0, "grad_norm": 0.1240331232588916, "kl": 0.0012782484163835761, "learning_rate": 7.135315217350891e-10, "loss": 1.3243407011032104e-06, "num_turns": 2.0, "reward": 0.6077274084091187, "reward_std": 0.29934900999069214, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6077274084091187, "rewards/MLPCodeOnPolicy32BORM/std": 0.32556846737861633, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5343.0, "completions/mean_length": 1273.13671875, "completions/min_length": 515.0, "epoch": 0.9795918367346939, "frac_reward_zero_std": 0.0, "grad_norm": 0.1288140416013684, "kl": 0.0013770640471193474, "learning_rate": 5.242934405720878e-10, "loss": 1.5227124094963074e-06, "num_turns": 2.0, "reward": 0.552533745765686, "reward_std": 0.37712985277175903, "rewards/MLPCodeOnPolicy32BORM/mean": 0.552533745765686, "rewards/MLPCodeOnPolicy32BORM/std": 0.3899850845336914, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2891.0, "completions/mean_length": 1344.17578125, "completions/min_length": 579.0, "epoch": 0.9829931972789115, "frac_reward_zero_std": 0.0, "grad_norm": 0.12675166680813696, "kl": 0.001224989002366783, "learning_rate": 3.6413157033077234e-10, "loss": 1.2069940567016602e-06, "num_turns": 2.0, "reward": 0.48343604803085327, "reward_std": 0.3913576304912567, "rewards/MLPCodeOnPolicy32BORM/mean": 0.48343604803085327, "rewards/MLPCodeOnPolicy32BORM/std": 0.3972838819026947, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3515.0, "completions/mean_length": 1218.1875, "completions/min_length": 494.0, "epoch": 0.9863945578231292, "frac_reward_zero_std": 0.0, "grad_norm": 0.12321595926218833, "kl": 0.0013713659063796513, "learning_rate": 2.3306457775981727e-10, "loss": 1.3820827007293701e-06, "num_turns": 2.0, "reward": 0.5790843963623047, "reward_std": 0.24808508157730103, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5790843963623047, "rewards/MLPCodeOnPolicy32BORM/std": 0.27677780389785767, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4311.0, "completions/mean_length": 1273.83203125, "completions/min_length": 515.0, "epoch": 0.9897959183673469, "frac_reward_zero_std": 0.0, "grad_norm": 0.1313147159889535, "kl": 0.0014206118667061673, "learning_rate": 1.3110773862126667e-10, "loss": 1.4603137969970703e-06, "num_turns": 2.0, "reward": 0.5936909914016724, "reward_std": 0.3352019786834717, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5936910510063171, "rewards/MLPCodeOnPolicy32BORM/std": 0.34831205010414124, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3531.0, "completions/mean_length": 1268.19921875, "completions/min_length": 497.0, "epoch": 0.9931972789115646, "frac_reward_zero_std": 0.0, "grad_norm": 0.125741452166242, "kl": 0.0013244858291727724, "learning_rate": 5.827293591006976e-11, "loss": 1.3262033462524414e-06, "num_turns": 2.0, "reward": 0.41567426919937134, "reward_std": 0.41647863388061523, "rewards/MLPCodeOnPolicy32BORM/mean": 0.41567426919937134, "rewards/MLPCodeOnPolicy32BORM/std": 0.4198899269104004, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2584.0, "completions/mean_length": 1112.0234375, "completions/min_length": 421.0, "epoch": 0.9965986394557823, "frac_reward_zero_std": 0.0, "grad_norm": 0.13234971889912275, "kl": 0.0014827251252427232, "learning_rate": 1.456865846913291e-11, "loss": 1.3709068298339844e-06, "num_turns": 2.0, "reward": 0.6423551440238953, "reward_std": 0.35404038429260254, "rewards/MLPCodeOnPolicy32BORM/mean": 0.6423551440238953, "rewards/MLPCodeOnPolicy32BORM/std": 0.3747543692588806, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3105.0, "completions/mean_length": 1199.40234375, "completions/min_length": 494.0, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.1285475179466372, "kl": 0.0014067331903788727, "learning_rate": 0.0, "loss": 1.391395926475525e-06, "num_turns": 2.0, "reward": 0.5415540933609009, "reward_std": 0.35335230827331543, "rewards/MLPCodeOnPolicy32BORM/mean": 0.5415540933609009, "rewards/MLPCodeOnPolicy32BORM/std": 0.374896377325058, "step": 294 } ], "logging_steps": 1, "max_steps": 294, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }