{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500.0,
  "global_step": 294,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4758.0,
      "completions/mean_length": 1819.5078125,
      "completions/min_length": 508.0,
      "epoch": 0.003401360544217687,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12744246988948524,
      "kl": 0.0,
      "learning_rate": 1.6666666666666665e-07,
      "loss": -3.3527612686157227e-08,
      "num_turns": 2.0,
      "reward": 0.34577980637550354,
      "reward_std": 0.4898218512535095,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.34577980637550354,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.49940040707588196,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5835.0,
      "completions/mean_length": 1820.296875,
      "completions/min_length": 640.0,
      "epoch": 0.006802721088435374,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1201274689767808,
      "kl": 0.0,
      "learning_rate": 3.333333333333333e-07,
      "loss": -9.033828973770142e-08,
      "num_turns": 2.0,
      "reward": 0.5030246376991272,
      "reward_std": 0.45653027296066284,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.503024697303772,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4683470129966736,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6221.0,
      "completions/mean_length": 1869.0234375,
      "completions/min_length": 625.0,
      "epoch": 0.01020408163265306,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13249851266483276,
      "kl": 5.7374833659196156e-05,
      "learning_rate": 5e-07,
      "loss": 1.6391277313232422e-07,
      "num_turns": 2.0,
      "reward": 0.4454033672809601,
      "reward_std": 0.4615671932697296,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.44540339708328247,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4942783713340759,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4441.0,
      "completions/mean_length": 1784.91015625,
      "completions/min_length": 804.0,
      "epoch": 0.013605442176870748,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12573995450643682,
      "kl": 5.082826066882262e-05,
      "learning_rate": 4.999854313415308e-07,
      "loss": 5.587935447692871e-08,
      "num_turns": 2.0,
      "reward": 0.5703479051589966,
      "reward_std": 0.36449864506721497,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.5703479051589966,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.3774077594280243,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5988.0,
      "completions/mean_length": 1741.33984375,
      "completions/min_length": 576.0,
      "epoch": 0.017006802721088437,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1373648881605736,
      "kl": 5.534328431622271e-05,
      "learning_rate": 4.999417270640898e-07,
      "loss": 6.332993507385254e-08,
      "num_turns": 2.0,
      "reward": 0.33942678570747375,
      "reward_std": 0.5093421936035156,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.33942678570747375,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.5255665183067322,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6849.0,
      "completions/mean_length": 1795.203125,
      "completions/min_length": 579.0,
      "epoch": 0.02040816326530612,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12060672847616874,
      "kl": 4.775961713221477e-05,
      "learning_rate": 4.998688922613787e-07,
      "loss": 6.705522537231445e-08,
      "num_turns": 2.0,
      "reward": 0.4659126400947571,
      "reward_std": 0.4433140754699707,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.46591266989707947,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.46012043952941895,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0078125,
      "completions/max_length": 8193.0,
      "completions/mean_length": 1754.07421875,
      "completions/min_length": 666.0,
      "epoch": 0.023809523809523808,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.11995636331377409,
      "kl": 4.119202509400566e-05,
      "learning_rate": 4.997669354222401e-07,
      "loss": -1.4901161193847656e-08,
      "num_turns": 2.0,
      "reward": 0.5474852323532104,
      "reward_std": 0.34531816840171814,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.5474852323532104,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.37691301107406616,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3899.0,
      "completions/mean_length": 1698.69921875,
      "completions/min_length": 745.0,
      "epoch": 0.027210884353741496,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13098214536619826,
      "kl": 5.105331194954488e-05,
      "learning_rate": 4.996358684296693e-07,
      "loss": 8.754432201385498e-08,
      "num_turns": 2.0,
      "reward": 0.5156557559967041,
      "reward_std": 0.4053615927696228,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.5156557559967041,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4166768491268158,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5423.0,
      "completions/mean_length": 1771.01953125,
      "completions/min_length": 692.0,
      "epoch": 0.030612244897959183,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12797291368590355,
      "kl": 5.446028086453225e-05,
      "learning_rate": 4.994757065594279e-07,
      "loss": -1.1175870895385742e-08,
      "num_turns": 2.0,
      "reward": 0.4673098921775818,
      "reward_std": 0.37777355313301086,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.4673098921775818,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.400544136762619,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 8193.0,
      "completions/mean_length": 1822.95703125,
      "completions/min_length": 688.0,
      "epoch": 0.034013605442176874,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12201373456484678,
      "kl": 4.741260931950819e-05,
      "learning_rate": 4.992864684782648e-07,
      "loss": 1.1641532182693481e-08,
      "num_turns": 2.0,
      "reward": 0.36175915598869324,
      "reward_std": 0.4400799870491028,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.36175912618637085,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.44151124358177185,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4675.0,
      "completions/mean_length": 1797.35546875,
      "completions/min_length": 707.0,
      "epoch": 0.03741496598639456,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12392330273911538,
      "kl": 4.906168550178336e-05,
      "learning_rate": 4.9906817624174e-07,
      "loss": 4.7963112592697144e-08,
      "num_turns": 2.0,
      "reward": 0.4319838285446167,
      "reward_std": 0.4490795135498047,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.4319838285446167,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.477209210395813,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4950.0,
      "completions/mean_length": 1725.4921875,
      "completions/min_length": 631.0,
      "epoch": 0.04081632653061224,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13305305669597486,
      "kl": 5.819839469722865e-05,
      "learning_rate": 4.988208552916535e-07,
      "loss": 5.587935447692871e-08,
      "num_turns": 2.0,
      "reward": 0.3035702109336853,
      "reward_std": 0.4499426484107971,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.3035701811313629,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4780357778072357,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0078125,
      "completions/max_length": 8193.0,
      "completions/mean_length": 1741.2734375,
      "completions/min_length": 597.0,
      "epoch": 0.04421768707482993,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13073505697177595,
      "kl": 5.914523239880509e-05,
      "learning_rate": 4.98544534453081e-07,
      "loss": 2.60770320892334e-08,
      "num_turns": 2.0,
      "reward": 0.3342248797416687,
      "reward_std": 0.5042227506637573,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.3342248797416687,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.5114654302597046,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3818.0,
      "completions/mean_length": 1577.34765625,
      "completions/min_length": 601.0,
      "epoch": 0.047619047619047616,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13342383735176347,
      "kl": 5.705263799882232e-05,
      "learning_rate": 4.98239245931014e-07,
      "loss": 1.4901161193847656e-08,
      "num_turns": 2.0,
      "reward": 0.5360592603683472,
      "reward_std": 0.3911046087741852,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.5360592007637024,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4391091465950012,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4162.0,
      "completions/mean_length": 1765.125,
      "completions/min_length": 585.0,
      "epoch": 0.05102040816326531,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12774705064954062,
      "kl": 5.5505841203284945e-05,
      "learning_rate": 4.979050253066063e-07,
      "loss": 1.6763806343078613e-08,
      "num_turns": 2.0,
      "reward": 0.38956862688064575,
      "reward_std": 0.49937379360198975,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.38956862688064575,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.48975542187690735,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5066.0,
      "completions/mean_length": 1762.51171875,
      "completions/min_length": 549.0,
      "epoch": 0.05442176870748299,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.124432419027365,
      "kl": 5.724478234014896e-05,
      "learning_rate": 4.975419115330267e-07,
      "loss": 5.21540641784668e-08,
      "num_turns": 2.0,
      "reward": 0.4735792279243469,
      "reward_std": 0.46047526597976685,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.4735792279243469,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4626067876815796,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4209.0,
      "completions/mean_length": 1677.42578125,
      "completions/min_length": 572.0,
      "epoch": 0.05782312925170068,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12903009670717827,
      "kl": 5.789786484911019e-05,
      "learning_rate": 4.971499469309197e-07,
      "loss": 2.2351741790771484e-08,
      "num_turns": 2.0,
      "reward": 0.35853224992752075,
      "reward_std": 0.4827578067779541,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.35853227972984314,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.49547749757766724,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3885.0,
      "completions/mean_length": 1666.55078125,
      "completions/min_length": 696.0,
      "epoch": 0.061224489795918366,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12695146741790425,
      "kl": 6.25709384394213e-05,
      "learning_rate": 4.967291771834726e-07,
      "loss": 1.0244548320770264e-07,
      "num_turns": 2.0,
      "reward": 0.4618152379989624,
      "reward_std": 0.47193068265914917,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.4618152379989624,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4882770776748657,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7093.0,
      "completions/mean_length": 1748.77734375,
      "completions/min_length": 520.0,
      "epoch": 0.06462585034013606,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12396913356347632,
      "kl": 5.553931009671942e-05,
      "learning_rate": 4.962796513310916e-07,
      "loss": 6.705522537231445e-08,
      "num_turns": 2.0,
      "reward": 0.39511638879776,
      "reward_std": 0.48685652017593384,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.39511638879776,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.48999226093292236,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5014.0,
      "completions/mean_length": 1794.609375,
      "completions/min_length": 680.0,
      "epoch": 0.06802721088435375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12764640221320528,
      "kl": 6.153282583909458e-05,
      "learning_rate": 4.958014217656854e-07,
      "loss": 7.450580596923828e-09,
      "num_turns": 2.0,
      "reward": 0.3481443524360657,
      "reward_std": 0.49198833107948303,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.34814438223838806,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.48914164304733276,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3801.0,
      "completions/mean_length": 1760.40625,
      "completions/min_length": 660.0,
      "epoch": 0.07142857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12593800576982925,
      "kl": 6.070601955343591e-05,
      "learning_rate": 4.952945442245597e-07,
      "loss": 2.3655593395233154e-07,
      "num_turns": 2.0,
      "reward": 0.33063238859176636,
      "reward_std": 0.48998257517814636,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.33063238859176636,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.501966118812561,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5325.0,
      "completions/mean_length": 1718.546875,
      "completions/min_length": 560.0,
      "epoch": 0.07482993197278912,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13000811471416104,
      "kl": 6.788247685562965e-05,
      "learning_rate": 4.947590777839208e-07,
      "loss": 1.2665987014770508e-07,
      "num_turns": 2.0,
      "reward": 0.39335066080093384,
      "reward_std": 0.49354591965675354,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.39335066080093384,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4913029074668884,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4239.0,
      "completions/mean_length": 1644.57421875,
      "completions/min_length": 532.0,
      "epoch": 0.0782312925170068,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1306033618236287,
      "kl": 6.613362518237409e-05,
      "learning_rate": 4.941950848519903e-07,
      "loss": -3.725290298461914e-09,
      "num_turns": 2.0,
      "reward": 0.4827514886856079,
      "reward_std": 0.47881123423576355,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.4827515184879303,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.47407078742980957,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4696.0,
      "completions/mean_length": 1832.0859375,
      "completions/min_length": 771.0,
      "epoch": 0.08163265306122448,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12365620608315255,
      "kl": 6.383584820923716e-05,
      "learning_rate": 4.936026311617316e-07,
      "loss": 5.029141902923584e-08,
      "num_turns": 2.0,
      "reward": 0.36635422706604004,
      "reward_std": 0.4653104245662689,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.36635422706604004,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4677668511867523,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5258.0,
      "completions/mean_length": 1764.4765625,
      "completions/min_length": 668.0,
      "epoch": 0.08503401360544217,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12675449588257132,
      "kl": 7.160034897424339e-05,
      "learning_rate": 4.92981785763188e-07,
      "loss": 2.7939677238464355e-08,
      "num_turns": 2.0,
      "reward": 0.3244081139564514,
      "reward_std": 0.4639052748680115,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.3244081437587738,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4596010744571686,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4426.0,
      "completions/mean_length": 1652.34375,
      "completions/min_length": 724.0,
      "epoch": 0.08843537414965986,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13265008990734048,
      "kl": 6.956292804716213e-05,
      "learning_rate": 4.923326210154364e-07,
      "loss": 1.862645149230957e-08,
      "num_turns": 2.0,
      "reward": 0.5124205350875854,
      "reward_std": 0.4297581613063812,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.5124205350875854,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.44342154264450073,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5193.0,
      "completions/mean_length": 1714.58203125,
      "completions/min_length": 669.0,
      "epoch": 0.09183673469387756,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13091734914956202,
      "kl": 6.905263046519394e-05,
      "learning_rate": 4.916552125781528e-07,
      "loss": 2.2351741790771484e-08,
      "num_turns": 2.0,
      "reward": 0.40827929973602295,
      "reward_std": 0.5019749402999878,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.40827929973602295,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.49093514680862427,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5548.0,
      "completions/mean_length": 1680.0234375,
      "completions/min_length": 727.0,
      "epoch": 0.09523809523809523,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12975055686418424,
      "kl": 7.241910009270214e-05,
      "learning_rate": 4.909496394027944e-07,
      "loss": 1.7508864402770996e-07,
      "num_turns": 2.0,
      "reward": 0.464019238948822,
      "reward_std": 0.46882909536361694,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.464019238948822,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4726778268814087,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7339.0,
      "completions/mean_length": 1737.5703125,
      "completions/min_length": 550.0,
      "epoch": 0.09863945578231292,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12663487691300868,
      "kl": 7.121374130747427e-05,
      "learning_rate": 4.902159837233984e-07,
      "loss": 8.568167686462402e-08,
      "num_turns": 2.0,
      "reward": 0.3782956600189209,
      "reward_std": 0.5221479535102844,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.3782956600189209,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.5246500968933105,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4165.0,
      "completions/mean_length": 1711.59765625,
      "completions/min_length": 574.0,
      "epoch": 0.10204081632653061,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13567792821976454,
      "kl": 7.378515601885738e-05,
      "learning_rate": 4.894543310469967e-07,
      "loss": 1.1734664440155029e-07,
      "num_turns": 2.0,
      "reward": 0.4272693991661072,
      "reward_std": 0.5026066899299622,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.4272693693637848,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.5159730911254883,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 6039.0,
      "completions/mean_length": 1723.34765625,
      "completions/min_length": 658.0,
      "epoch": 0.1054421768707483,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1220857505904424,
      "kl": 7.243182494676148e-05,
      "learning_rate": 4.886647701436513e-07,
      "loss": 7.636845111846924e-08,
      "num_turns": 2.0,
      "reward": 0.43645182251930237,
      "reward_std": 0.4528648257255554,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.43645179271698,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4559268057346344,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4791.0,
      "completions/mean_length": 1782.40625,
      "completions/min_length": 754.0,
      "epoch": 0.10884353741496598,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12742842792578463,
      "kl": 7.836230156499369e-05,
      "learning_rate": 4.878473930361071e-07,
      "loss": 5.494803190231323e-08,
      "num_turns": 2.0,
      "reward": 0.4406846761703491,
      "reward_std": 0.47518694400787354,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.4406846761703491,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.489067018032074,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7646.0,
      "completions/mean_length": 1724.53125,
      "completions/min_length": 596.0,
      "epoch": 0.11224489795918367,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12769787449282893,
      "kl": 7.334854217333486e-05,
      "learning_rate": 4.870022949890676e-07,
      "loss": 6.05359673500061e-08,
      "num_turns": 2.0,
      "reward": 0.38206565380096436,
      "reward_std": 0.4586794376373291,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.38206565380096436,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.46659746766090393,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4114.0,
      "completions/mean_length": 1700.08203125,
      "completions/min_length": 646.0,
      "epoch": 0.11564625850340136,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1292062463672102,
      "kl": 7.518993618305103e-05,
      "learning_rate": 4.861295744980913e-07,
      "loss": 8.940696716308594e-08,
      "num_turns": 2.0,
      "reward": 0.4325985312461853,
      "reward_std": 0.40971022844314575,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.4325985312461853,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4278082549571991,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4858.0,
      "completions/mean_length": 1868.4921875,
      "completions/min_length": 547.0,
      "epoch": 0.11904761904761904,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12345204127675038,
      "kl": 7.450408247677842e-05,
      "learning_rate": 4.852293332781124e-07,
      "loss": 5.727633833885193e-08,
      "num_turns": 2.0,
      "reward": 0.35513925552368164,
      "reward_std": 0.44654712080955505,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.35513922572135925,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.45481741428375244,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0078125,
      "completions/max_length": 8193.0,
      "completions/mean_length": 1713.609375,
      "completions/min_length": 561.0,
      "epoch": 0.12244897959183673,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12510015631970176,
      "kl": 7.508149332124958e-05,
      "learning_rate": 4.843016762515859e-07,
      "loss": 8.940696716308594e-08,
      "num_turns": 2.0,
      "reward": 0.40725451707839966,
      "reward_std": 0.4564969539642334,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.40725451707839966,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.46071183681488037,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 3718.0,
      "completions/mean_length": 1622.859375,
      "completions/min_length": 480.0,
      "epoch": 0.12585034013605442,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.1286319398978756,
      "kl": 7.390474104340683e-05,
      "learning_rate": 4.833467115362589e-07,
      "loss": 1.1548399925231934e-07,
      "num_turns": 2.0,
      "reward": 0.47519996762275696,
      "reward_std": 0.3644979000091553,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.47519993782043457,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.380415678024292,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 4462.0,
      "completions/mean_length": 1670.49609375,
      "completions/min_length": 571.0,
      "epoch": 0.1292517006802721,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13038416386547164,
      "kl": 7.876049130572937e-05,
      "learning_rate": 4.823645504325699e-07,
      "loss": 7.450580596923828e-08,
      "num_turns": 2.0,
      "reward": 0.3584836721420288,
      "reward_std": 0.4540232717990875,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.3584836721420288,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.44441646337509155,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5251.0,
      "completions/mean_length": 1839.046875,
      "completions/min_length": 490.0,
      "epoch": 0.1326530612244898,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12463579695824678,
      "kl": 7.707584961735847e-05,
      "learning_rate": 4.81355307410676e-07,
      "loss": 7.82310962677002e-08,
      "num_turns": 2.0,
      "reward": 0.41247132420539856,
      "reward_std": 0.4671492278575897,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.41247132420539856,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.4722498953342438,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 7413.0,
      "completions/mean_length": 1708.32421875,
      "completions/min_length": 477.0,
      "epoch": 0.1360544217687075,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.13875688765149183,
      "kl": 8.656998204514821e-05,
      "learning_rate": 4.803191000971128e-07,
      "loss": -3.725290298461914e-09,
      "num_turns": 2.0,
      "reward": 0.36401766538619995,
      "reward_std": 0.4898770749568939,
      "rewards/MLPCodeOnPolicy32BORM/mean": 0.36401766538619995,
      "rewards/MLPCodeOnPolicy32BORM/std": 0.48293355107307434,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 5456.0,
      "completions/mean_length": 1664.0078125,
      "completions/min_length": 694.0,
      "epoch": 0.13945578231292516,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.12606923287028104,
      "kl": 8.074963511717215e-05,
      "learning_rate": 4.792560492610835e-07,
| "loss": 7.450580596923828e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.4370952248573303, | |
| "reward_std": 0.3985515832901001, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4370952248573303, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.39930838346481323, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7162.0, | |
| "completions/mean_length": 1715.57421875, | |
| "completions/min_length": 603.0, | |
| "epoch": 0.14285714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1349781738713447, | |
| "kl": 8.651554469452094e-05, | |
| "learning_rate": 4.78166278800385e-07, | |
| "loss": 1.043081283569336e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.36490052938461304, | |
| "reward_std": 0.45700886845588684, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.36490052938461304, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4708576202392578, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1832.96484375, | |
| "completions/min_length": 748.0, | |
| "epoch": 0.14625850340136054, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12229902259500831, | |
| "kl": 7.565521184460522e-05, | |
| "learning_rate": 4.770499157269663e-07, | |
| "loss": -3.4458935260772705e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.47392046451568604, | |
| "reward_std": 0.4232255220413208, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.47392046451568604, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4284113347530365, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6068.0, | |
| "completions/mean_length": 1806.5859375, | |
| "completions/min_length": 665.0, | |
| "epoch": 0.14965986394557823, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12303543586950182, | |
| "kl": 7.862668167035736e-05, | |
| "learning_rate": 4.7590709015212633e-07, | |
| "loss": 6.146728992462158e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.14448022842407227, | |
| "reward_std": 0.5132663249969482, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.14448022842407227, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.5187545418739319, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1684.65234375, | |
| "completions/min_length": 656.0, | |
| "epoch": 0.15306122448979592, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12705093094712636, | |
| "kl": 8.699895329300489e-05, | |
| "learning_rate": 4.747379352713488e-07, | |
| "loss": 8.940696716308594e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.43057385087013245, | |
| "reward_std": 0.42482852935791016, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.43057382106781006, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4353935718536377, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5181.0, | |
| "completions/mean_length": 1747.23046875, | |
| "completions/min_length": 646.0, | |
| "epoch": 0.1564625850340136, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13013278675105897, | |
| "kl": 8.532286244644638e-05, | |
| "learning_rate": 4.7354258734877906e-07, | |
| "loss": 7.82310962677002e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.21112340688705444, | |
| "reward_std": 0.49027031660079956, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.21112340688705444, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.5018395781517029, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5155.0, | |
| "completions/mean_length": 1782.39453125, | |
| "completions/min_length": 626.0, | |
| "epoch": 0.1598639455782313, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12503700814380458, | |
| "kl": 8.357773197076312e-05, | |
| "learning_rate": 4.7232118570134227e-07, | |
| "loss": 2.9802322387695312e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.46107158064842224, | |
| "reward_std": 0.4947185516357422, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.46107155084609985, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4803822934627533, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3971.0, | |
| "completions/mean_length": 1709.5, | |
| "completions/min_length": 567.0, | |
| "epoch": 0.16326530612244897, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12837085947616933, | |
| "kl": 8.580618555242836e-05, | |
| "learning_rate": 4.7107387268250586e-07, | |
| "loss": 6.705522537231445e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.39588984847068787, | |
| "reward_std": 0.4714891314506531, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.3958898186683655, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.46815717220306396, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5086.0, | |
| "completions/mean_length": 1820.640625, | |
| "completions/min_length": 791.0, | |
| "epoch": 0.16666666666666666, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12510105746824587, | |
| "kl": 8.930987769417698e-05, | |
| "learning_rate": 4.69800793665689e-07, | |
| "loss": 1.1548399925231934e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.28018033504486084, | |
| "reward_std": 0.4455254077911377, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.28018033504486084, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.46793651580810547, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1757.30078125, | |
| "completions/min_length": 697.0, | |
| "epoch": 0.17006802721088435, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1205305455949019, | |
| "kl": 9.023273048569536e-05, | |
| "learning_rate": 4.685020970273189e-07, | |
| "loss": 1.0803341865539551e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.34440556168556213, | |
| "reward_std": 0.42845094203948975, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.34440553188323975, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42874884605407715, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4636.0, | |
| "completions/mean_length": 1805.30859375, | |
| "completions/min_length": 759.0, | |
| "epoch": 0.17346938775510204, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12358348328235372, | |
| "kl": 9.223375809597201e-05, | |
| "learning_rate": 4.6717793412953776e-07, | |
| "loss": 1.4528632164001465e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.35937339067459106, | |
| "reward_std": 0.45837679505348206, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.35937339067459106, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.45718511939048767, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4918.0, | |
| "completions/mean_length": 1776.66015625, | |
| "completions/min_length": 468.0, | |
| "epoch": 0.17687074829931973, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12023113902964513, | |
| "kl": 9.91547349258326e-05, | |
| "learning_rate": 4.6582845930256166e-07, | |
| "loss": 6.146728992462158e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.4752587080001831, | |
| "reward_std": 0.46407151222229004, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4752587080001831, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4792989492416382, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4763.0, | |
| "completions/mean_length": 1784.984375, | |
| "completions/min_length": 632.0, | |
| "epoch": 0.18027210884353742, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12500207873670774, | |
| "kl": 0.00010303906310582533, | |
| "learning_rate": 4.6445382982669354e-07, | |
| "loss": 8.940696716308594e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.31531599164009094, | |
| "reward_std": 0.46134036779403687, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.31531602144241333, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.47073715925216675, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7204.0, | |
| "completions/mean_length": 1856.4453125, | |
| "completions/min_length": 538.0, | |
| "epoch": 0.1836734693877551, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12421921518476847, | |
| "kl": 9.920350612446782e-05, | |
| "learning_rate": 4.630542059139923e-07, | |
| "loss": 6.705522537231445e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.33992254734039307, | |
| "reward_std": 0.5055447220802307, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.33992254734039307, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4955473840236664, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1686.87890625, | |
| "completions/min_length": 479.0, | |
| "epoch": 0.1870748299319728, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1254120719869612, | |
| "kl": 0.00011915153527297662, | |
| "learning_rate": 4.616297506896001e-07, | |
| "loss": 1.4901161193847656e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.3778541684150696, | |
| "reward_std": 0.47446173429489136, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.3778541386127472, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.48409169912338257, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4574.0, | |
| "completions/mean_length": 1665.82421875, | |
| "completions/min_length": 562.0, | |
| "epoch": 0.19047619047619047, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12973197352263433, | |
| "kl": 0.00011406978615013941, | |
| "learning_rate": 4.601806301727302e-07, | |
| "loss": 1.043081283569336e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.44780969619750977, | |
| "reward_std": 0.4106203019618988, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4478096663951874, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.43979954719543457, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6027.0, | |
| "completions/mean_length": 1682.44921875, | |
| "completions/min_length": 520.0, | |
| "epoch": 0.19387755102040816, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13270967450989135, | |
| "kl": 0.00012001413165307895, | |
| "learning_rate": 4.5870701325731773e-07, | |
| "loss": 6.705522537231445e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.49996864795684814, | |
| "reward_std": 0.4304153323173523, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.49996861815452576, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.44452470541000366, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1712.70703125, | |
| "completions/min_length": 597.0, | |
| "epoch": 0.19727891156462585, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12258262023187534, | |
| "kl": 0.00011655640764729469, | |
| "learning_rate": 4.572090716923353e-07, | |
| "loss": 1.1175870895385742e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4167475700378418, | |
| "reward_std": 0.42564165592193604, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4167475700378418, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.45184454321861267, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4757.0, | |
| "completions/mean_length": 1797.17578125, | |
| "completions/min_length": 614.0, | |
| "epoch": 0.20068027210884354, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12136217537860085, | |
| "kl": 0.00010487847112017334, | |
| "learning_rate": 4.556869800617753e-07, | |
| "loss": 6.705522537231445e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.4988711476325989, | |
| "reward_std": 0.4274623990058899, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4988711476325989, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.43597930669784546, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6339.0, | |
| "completions/mean_length": 1699.640625, | |
| "completions/min_length": 574.0, | |
| "epoch": 0.20408163265306123, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12800442809508575, | |
| "kl": 0.00012965655946572952, | |
| "learning_rate": 4.541409157643027e-07, | |
| "loss": 1.210719347000122e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.47402942180633545, | |
| "reward_std": 0.48247969150543213, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.47402939200401306, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4953751862049103, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6665.0, | |
| "completions/mean_length": 1608.78515625, | |
| "completions/min_length": 572.0, | |
| "epoch": 0.20748299319727892, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13210996972657596, | |
| "kl": 0.00013131504510965897, | |
| "learning_rate": 4.5257105899257937e-07, | |
| "loss": 2.0302832126617432e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.47677308320999146, | |
| "reward_std": 0.464009165763855, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.47677308320999146, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4753335416316986, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4159.0, | |
| "completions/mean_length": 1727.4140625, | |
| "completions/min_length": 599.0, | |
| "epoch": 0.2108843537414966, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1276960509567404, | |
| "kl": 0.000136644269559838, | |
| "learning_rate": 4.5097759271226247e-07, | |
| "loss": 1.1734664440155029e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4505724310874939, | |
| "reward_std": 0.4149358868598938, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4505724310874939, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4140785038471222, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5320.0, | |
| "completions/mean_length": 1669.0546875, | |
| "completions/min_length": 527.0, | |
| "epoch": 0.21428571428571427, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13006286691884608, | |
| "kl": 0.0001404530455602071, | |
| "learning_rate": 4.4936070264068016e-07, | |
| "loss": 1.3131648302078247e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4712015986442566, | |
| "reward_std": 0.43090948462486267, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4712015986442566, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.43964844942092896, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6049.0, | |
| "completions/mean_length": 1694.3203125, | |
| "completions/min_length": 589.0, | |
| "epoch": 0.21768707482993196, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1233139379942691, | |
| "kl": 0.0001400133287461358, | |
| "learning_rate": 4.477205772251864e-07, | |
| "loss": 1.862645149230957e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.45948171615600586, | |
| "reward_std": 0.385934054851532, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.45948168635368347, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4270806610584259, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3535.0, | |
| "completions/mean_length": 1640.0234375, | |
| "completions/min_length": 534.0, | |
| "epoch": 0.22108843537414966, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13305210668094364, | |
| "kl": 0.0001618890607915091, | |
| "learning_rate": 4.4605740762119726e-07, | |
| "loss": 2.4028122425079346e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.5881233811378479, | |
| "reward_std": 0.3828655481338501, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5881233811378479, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.38702020049095154, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6117.0, | |
| "completions/mean_length": 1656.7265625, | |
| "completions/min_length": 551.0, | |
| "epoch": 0.22448979591836735, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12517742318338876, | |
| "kl": 0.00014426143570744898, | |
| "learning_rate": 4.443713876699123e-07, | |
| "loss": 8.754432201385498e-08, | |
| "num_turns": 2.0, | |
| "reward": 0.4771096706390381, | |
| "reward_std": 0.4157760739326477, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4771096706390381, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.45321065187454224, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7229.0, | |
| "completions/mean_length": 1641.0703125, | |
| "completions/min_length": 445.0, | |
| "epoch": 0.22789115646258504, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13614948043808453, | |
| "kl": 0.00016474105905217584, | |
| "learning_rate": 4.426627138757223e-07, | |
| "loss": 1.8812716007232666e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.5012058019638062, | |
| "reward_std": 0.4257659912109375, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5012058019638062, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.435712069272995, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5974.0, | |
| "completions/mean_length": 1757.94921875, | |
| "completions/min_length": 642.0, | |
| "epoch": 0.23129251700680273, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13400741095314625, | |
| "kl": 0.00016489506492689543, | |
| "learning_rate": 4.409315853833067e-07, | |
| "loss": 1.564621925354004e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.3585292398929596, | |
| "reward_std": 0.4674282670021057, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.3585292100906372, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4627254009246826, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1691.52734375, | |
| "completions/min_length": 555.0, | |
| "epoch": 0.23469387755102042, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13189680063936968, | |
| "kl": 0.00018260819911120052, | |
| "learning_rate": 4.391782039544238e-07, | |
| "loss": 2.1979212760925293e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.38688501715660095, | |
| "reward_std": 0.44273632764816284, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.38688501715660095, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.44483256340026855, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5656.0, | |
| "completions/mean_length": 1771.47265625, | |
| "completions/min_length": 577.0, | |
| "epoch": 0.23809523809523808, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11942098230903217, | |
| "kl": 0.00015966588580340613, | |
| "learning_rate": 4.374027739443952e-07, | |
| "loss": 1.9744038581848145e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4191434979438782, | |
| "reward_std": 0.39943796396255493, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.41914352774620056, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4300435185432434, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5968.0, | |
| "completions/mean_length": 1812.03125, | |
| "completions/min_length": 576.0, | |
| "epoch": 0.24149659863945577, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12415689131892878, | |
| "kl": 0.00016537675492145354, | |
| "learning_rate": 4.3560550227828834e-07, | |
| "loss": 1.7508864402770996e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.5332323312759399, | |
| "reward_std": 0.3747295141220093, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5332322716712952, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3812965452671051, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5582.0, | |
| "completions/mean_length": 1716.89453125, | |
| "completions/min_length": 589.0, | |
| "epoch": 0.24489795918367346, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1249995771267063, | |
| "kl": 0.0001809181378575886, | |
| "learning_rate": 4.337865984268001e-07, | |
| "loss": 1.9371509552001953e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.43725067377090454, | |
| "reward_std": 0.42010611295700073, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.43725070357322693, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.43573060631752014, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7308.0, | |
| "completions/mean_length": 1728.125, | |
| "completions/min_length": 579.0, | |
| "epoch": 0.24829931972789115, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12553968210728977, | |
| "kl": 0.00020021227760480542, | |
| "learning_rate": 4.3194627438184233e-07, | |
| "loss": 1.7508864402770996e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4539065957069397, | |
| "reward_std": 0.46171799302101135, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4539065957069397, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.457562118768692, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5294.0, | |
| "completions/mean_length": 1772.98828125, | |
| "completions/min_length": 695.0, | |
| "epoch": 0.25170068027210885, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11900417608479627, | |
| "kl": 0.00018830790418178367, | |
| "learning_rate": 4.3008474463183496e-07, | |
| "loss": 2.086162567138672e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4869869351387024, | |
| "reward_std": 0.4232376515865326, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4869869649410248, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4423135221004486, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1740.9140625, | |
| "completions/min_length": 516.0, | |
| "epoch": 0.25510204081632654, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1256588784763905, | |
| "kl": 0.00020819194855903334, | |
| "learning_rate": 4.282022261367073e-07, | |
| "loss": 2.2351741790771484e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.48140135407447815, | |
| "reward_std": 0.40134385228157043, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.48140132427215576, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.44396087527275085, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5386.0, | |
| "completions/mean_length": 1720.82421875, | |
| "completions/min_length": 624.0, | |
| "epoch": 0.2585034013605442, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1261532919992208, | |
| "kl": 0.00020027517689413799, | |
| "learning_rate": 4.262989383026114e-07, | |
| "loss": 2.4400651454925537e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.33791497349739075, | |
| "reward_std": 0.47338148951530457, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.33791500329971313, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4666896462440491, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5895.0, | |
| "completions/mean_length": 1673.328125, | |
| "completions/min_length": 458.0, | |
| "epoch": 0.2619047619047619, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1281227994019189, | |
| "kl": 0.0002022001203840773, | |
| "learning_rate": 4.243751029563507e-07, | |
| "loss": 1.6391277313232422e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.49508824944496155, | |
| "reward_std": 0.42326685786247253, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.49508827924728394, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.44494879245758057, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1793.4453125, | |
| "completions/min_length": 737.0, | |
| "epoch": 0.2653061224489796, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12400128210142895, | |
| "kl": 0.00018887607643591764, | |
| "learning_rate": 4.2243094431952607e-07, | |
| "loss": 1.993030309677124e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.3935734033584595, | |
| "reward_std": 0.4245133399963379, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.3935733735561371, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4357367157936096, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5328.0, | |
| "completions/mean_length": 1590.703125, | |
| "completions/min_length": 621.0, | |
| "epoch": 0.2687074829931973, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1276264683063012, | |
| "kl": 0.0002454274231240561, | |
| "learning_rate": 4.2046668898240296e-07, | |
| "loss": 2.1792948246002197e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4903767704963684, | |
| "reward_std": 0.41225385665893555, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.49037671089172363, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4092792570590973, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5005.0, | |
| "completions/mean_length": 1648.64453125, | |
| "completions/min_length": 449.0, | |
| "epoch": 0.272108843537415, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13886289723410575, | |
| "kl": 0.0002334505954877386, | |
| "learning_rate": 4.184825658775027e-07, | |
| "loss": 1.993030309677124e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4053865075111389, | |
| "reward_std": 0.40530964732170105, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4053865373134613, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4158993363380432, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4601.0, | |
| "completions/mean_length": 1696.8515625, | |
| "completions/min_length": 533.0, | |
| "epoch": 0.2755102040816326, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12564175760915286, | |
| "kl": 0.00022438414066527912, | |
| "learning_rate": 4.1647880625292027e-07, | |
| "loss": 2.3562461137771606e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4553804397583008, | |
| "reward_std": 0.37617209553718567, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.45538046956062317, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3870598375797272, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5275.0, | |
| "completions/mean_length": 1695.30078125, | |
| "completions/min_length": 468.0, | |
| "epoch": 0.2789115646258503, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12220072275161292, | |
| "kl": 0.00022701559555571293, | |
| "learning_rate": 4.1445564364537266e-07, | |
| "loss": 1.9371509552001953e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.5103209614753723, | |
| "reward_std": 0.3658777177333832, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5103209018707275, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.38078218698501587, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6187.0, | |
| "completions/mean_length": 1759.6953125, | |
| "completions/min_length": 416.0, | |
| "epoch": 0.282312925170068, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1282708212338203, | |
| "kl": 0.00020942796845702105, | |
| "learning_rate": 4.124133138529803e-07, | |
| "loss": 2.1141022443771362e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.3109988272190094, | |
| "reward_std": 0.452178031206131, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.310998797416687, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4474020004272461, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1670.4296875, | |
| "completions/min_length": 514.0, | |
| "epoch": 0.2857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1303929574716558, | |
| "kl": 0.0002468315587975667, | |
| "learning_rate": 4.1035205490778496e-07, | |
| "loss": 3.0919909477233887e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.5012885332107544, | |
| "reward_std": 0.499253511428833, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5012885332107544, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4859963655471802, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6447.0, | |
| "completions/mean_length": 1774.4921875, | |
| "completions/min_length": 523.0, | |
| "epoch": 0.2891156462585034, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1321627252917682, | |
| "kl": 0.00023074077262208448, | |
| "learning_rate": 4.0827210704800745e-07, | |
| "loss": 1.043081283569336e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4319390058517456, | |
| "reward_std": 0.48060840368270874, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4319390058517456, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.49155551195144653, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4836.0, | |
| "completions/mean_length": 1665.26171875, | |
| "completions/min_length": 549.0, | |
| "epoch": 0.2925170068027211, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12803690948396568, | |
| "kl": 0.00025092134728765814, | |
| "learning_rate": 4.061737126900478e-07, | |
| "loss": 3.8929283618927e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.5961266160011292, | |
| "reward_std": 0.36096709966659546, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5961266160011292, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.39821961522102356, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6643.0, | |
| "completions/mean_length": 1680.0703125, | |
| "completions/min_length": 548.0, | |
| "epoch": 0.29591836734693877, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1290457837944881, | |
| "kl": 0.00026406801862322027, | |
| "learning_rate": 4.040571164002318e-07, | |
| "loss": 2.7194619178771973e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.3678485155105591, | |
| "reward_std": 0.4080837368965149, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.3678485155105591, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4086749851703644, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4514.0, | |
| "completions/mean_length": 1608.74609375, | |
| "completions/min_length": 561.0, | |
| "epoch": 0.29931972789115646, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12673992847722323, | |
| "kl": 0.00025539301213939325, | |
| "learning_rate": 4.019225648663072e-07, | |
| "loss": 2.551823854446411e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.5111918449401855, | |
| "reward_std": 0.3585626482963562, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5111918449401855, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3727354109287262, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6841.0, | |
| "completions/mean_length": 1656.125, | |
| "completions/min_length": 541.0, | |
| "epoch": 0.30272108843537415, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1275437551932911, | |
| "kl": 0.00023101651822798885, | |
| "learning_rate": 3.997703068686923e-07, | |
| "loss": 2.123415470123291e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4530450105667114, | |
| "reward_std": 0.40583527088165283, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4530450105667114, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4039011299610138, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5366.0, | |
| "completions/mean_length": 1626.859375, | |
| "completions/min_length": 506.0, | |
| "epoch": 0.30612244897959184, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12855829116477677, | |
| "kl": 0.00025528713968014927, | |
| "learning_rate": 3.9760059325148063e-07, | |
| "loss": 1.30385160446167e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4176099896430969, | |
| "reward_std": 0.43359875679016113, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4176099896430969, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.44678640365600586, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5738.0, | |
| "completions/mean_length": 1691.70703125, | |
| "completions/min_length": 483.0, | |
| "epoch": 0.30952380952380953, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12543869877212768, | |
| "kl": 0.00025521306588416337, | |
| "learning_rate": 3.954136768932056e-07, | |
| "loss": 2.8032809495925903e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.554349422454834, | |
| "reward_std": 0.38532671332359314, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.554349422454834, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4123691916465759, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6134.0, | |
| "completions/mean_length": 1643.35546875, | |
| "completions/min_length": 573.0, | |
| "epoch": 0.3129251700680272, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13128657513735853, | |
| "kl": 0.00027815132762043504, | |
| "learning_rate": 3.932098126773674e-07, | |
| "loss": 2.3096799850463867e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4468424320220947, | |
| "reward_std": 0.3695867657661438, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4468424320220947, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.37673676013946533, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5820.0, | |
| "completions/mean_length": 1658.8046875, | |
| "completions/min_length": 500.0, | |
| "epoch": 0.3163265306122449, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1232640314353311, | |
| "kl": 0.0002867808389055426, | |
| "learning_rate": 3.909892574627266e-07, | |
| "loss": 3.203749656677246e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.431984543800354, | |
| "reward_std": 0.4377307593822479, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.431984543800354, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4598510265350342, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5530.0, | |
| "completions/mean_length": 1767.25390625, | |
| "completions/min_length": 391.0, | |
| "epoch": 0.3197278911564626, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13394884598636536, | |
| "kl": 0.00028032092995999847, | |
| "learning_rate": 3.887522700533675e-07, | |
| "loss": 2.4028122425079346e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.45087701082229614, | |
| "reward_std": 0.4690535068511963, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.45087701082229614, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.491955041885376, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6639.0, | |
| "completions/mean_length": 1665.76171875, | |
| "completions/min_length": 687.0, | |
| "epoch": 0.3231292517006803, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12812015596809412, | |
| "kl": 0.0002728628523982479, | |
| "learning_rate": 3.864991111685345e-07, | |
| "loss": 2.7939677238464355e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.48972389101982117, | |
| "reward_std": 0.4347324073314667, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4897238612174988, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.43040478229522705, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1657.66015625, | |
| "completions/min_length": 563.0, | |
| "epoch": 0.32653061224489793, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12430643043240865, | |
| "kl": 0.00028989990596528514, | |
| "learning_rate": 3.8423004341224595e-07, | |
| "loss": 2.7567148208618164e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.33566781878471375, | |
| "reward_std": 0.5082817077636719, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.33566781878471375, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.49739500880241394, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4475.0, | |
| "completions/mean_length": 1591.48046875, | |
| "completions/min_length": 546.0, | |
| "epoch": 0.3299319727891156, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1262900325387896, | |
| "kl": 0.00032333704984921496, | |
| "learning_rate": 3.819453312426871e-07, | |
| "loss": 2.644956111907959e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.3694228529930115, | |
| "reward_std": 0.4831184148788452, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.3694228529930115, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.500664472579956, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3747.0, | |
| "completions/mean_length": 1610.03515625, | |
| "completions/min_length": 546.0, | |
| "epoch": 0.3333333333333333, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1258432953697267, | |
| "kl": 0.0002983086321819428, | |
| "learning_rate": 3.796452409413887e-07, | |
| "loss": 3.0547380447387695e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.5325152277946472, | |
| "reward_std": 0.4058530330657959, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5325151681900024, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.41554924845695496, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4152.0, | |
| "completions/mean_length": 1535.42578125, | |
| "completions/min_length": 470.0, | |
| "epoch": 0.336734693877551, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13426108423653652, | |
| "kl": 0.0003142357822980557, | |
| "learning_rate": 3.773300405821908e-07, | |
| "loss": 4.10713255405426e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.5315067768096924, | |
| "reward_std": 0.37495577335357666, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5315067768096924, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.38837048411369324, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4829.0, | |
| "completions/mean_length": 1651.390625, | |
| "completions/min_length": 535.0, | |
| "epoch": 0.3401360544217687, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1304295648701895, | |
| "kl": 0.000332591352162126, | |
| "learning_rate": 3.75e-07, | |
| "loss": 3.501772880554199e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.3912454843521118, | |
| "reward_std": 0.45414209365844727, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.3912455141544342, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4752326011657715, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4701.0, | |
| "completions/mean_length": 1614.3359375, | |
| "completions/min_length": 570.0, | |
| "epoch": 0.3435374149659864, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1245550118423448, | |
| "kl": 0.00038185133644219604, | |
| "learning_rate": 3.726553907593401e-07, | |
| "loss": 3.594905138015747e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.42279961705207825, | |
| "reward_std": 0.3496551215648651, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.42279961705207825, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.35374781489372253, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1682.421875, | |
| "completions/min_length": 566.0, | |
| "epoch": 0.3469387755102041, | |
| "frac_reward_zero_std": 0.0625, | |
| "grad_norm": 0.12739944992531263, | |
| "kl": 0.0003275773358382139, | |
| "learning_rate": 3.7029648612270123e-07, | |
| "loss": 3.2782554626464844e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.2956712245941162, | |
| "reward_std": 0.4861956238746643, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.2956712245941162, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.5181453227996826, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1673.75390625, | |
| "completions/min_length": 562.0, | |
| "epoch": 0.35034013605442177, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13445060320474667, | |
| "kl": 0.000384508166462183, | |
| "learning_rate": 3.6792356101869156e-07, | |
| "loss": 3.725290298461914e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.3457567095756531, | |
| "reward_std": 0.39632177352905273, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.3457567095756531, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.41148948669433594, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1538.5703125, | |
| "completions/min_length": 458.0, | |
| "epoch": 0.35374149659863946, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13289507816296844, | |
| "kl": 0.00042833897032323875, | |
| "learning_rate": 3.655368920099942e-07, | |
| "loss": 3.7997961044311523e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4717833995819092, | |
| "reward_std": 0.3555891215801239, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4717833995819092, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3742328882217407, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5154.0, | |
| "completions/mean_length": 1680.16015625, | |
| "completions/min_length": 502.0, | |
| "epoch": 0.35714285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12610684349294984, | |
| "kl": 0.00036339681264507817, | |
| "learning_rate": 3.6313675726113475e-07, | |
| "loss": 3.650784492492676e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.42708680033683777, | |
| "reward_std": 0.4701390862464905, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.42708683013916016, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.47297847270965576, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4806.0, | |
| "completions/mean_length": 1588.52734375, | |
| "completions/min_length": 433.0, | |
| "epoch": 0.36054421768707484, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12977436985513188, | |
| "kl": 0.0004353736439952627, | |
| "learning_rate": 3.607234365060604e-07, | |
| "loss": 3.5390257835388184e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.49587902426719666, | |
| "reward_std": 0.3839839696884155, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.49587899446487427, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.40533238649368286, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6636.0, | |
| "completions/mean_length": 1518.19140625, | |
| "completions/min_length": 507.0, | |
| "epoch": 0.36394557823129253, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13218204603131029, | |
| "kl": 0.00048701832611186546, | |
| "learning_rate": 3.5829721101553826e-07, | |
| "loss": 5.308538675308228e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.41638830304145813, | |
| "reward_std": 0.38539189100265503, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.41638830304145813, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3954349458217621, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6301.0, | |
| "completions/mean_length": 1585.0234375, | |
| "completions/min_length": 454.0, | |
| "epoch": 0.3673469387755102, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12737274117239786, | |
| "kl": 0.0004323695679886441, | |
| "learning_rate": 3.558583635643726e-07, | |
| "loss": 4.507601261138916e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.5039672255516052, | |
| "reward_std": 0.401162713766098, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.50396728515625, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4077160656452179, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1631.3125, | |
| "completions/min_length": 535.0, | |
| "epoch": 0.3707482993197279, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12547409000395443, | |
| "kl": 0.00047672909840912325, | |
| "learning_rate": 3.5340717839844787e-07, | |
| "loss": 4.246830940246582e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4858195185661316, | |
| "reward_std": 0.3277161717414856, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4858194887638092, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3596702814102173, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4351.0, | |
| "completions/mean_length": 1497.08984375, | |
| "completions/min_length": 647.0, | |
| "epoch": 0.3741496598639456, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13036809035265715, | |
| "kl": 0.0005725711534978473, | |
| "learning_rate": 3.509439412016004e-07, | |
| "loss": 5.327165126800537e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.41480642557144165, | |
| "reward_std": 0.4249509572982788, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.41480645537376404, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.445372611284256, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4360.0, | |
| "completions/mean_length": 1546.18359375, | |
| "completions/min_length": 463.0, | |
| "epoch": 0.37755102040816324, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13263776844617658, | |
| "kl": 0.0005119868446854525, | |
| "learning_rate": 3.484689390623218e-07, | |
| "loss": 5.178153514862061e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.39183998107910156, | |
| "reward_std": 0.4916958510875702, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.39183998107910156, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4971933960914612, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6743.0, | |
| "completions/mean_length": 1607.42578125, | |
| "completions/min_length": 533.0, | |
| "epoch": 0.38095238095238093, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1335978649489609, | |
| "kl": 0.0005102338586766564, | |
| "learning_rate": 3.4598246044029906e-07, | |
| "loss": 5.513429641723633e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.3588225543498993, | |
| "reward_std": 0.4509512782096863, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.3588225841522217, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.48127850890159607, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4036.0, | |
| "completions/mean_length": 1477.58203125, | |
| "completions/min_length": 466.0, | |
| "epoch": 0.3843537414965986, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12933390137700235, | |
| "kl": 0.000587871592870215, | |
| "learning_rate": 3.4348479513279484e-07, | |
| "loss": 5.550682544708252e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.5935378074645996, | |
| "reward_std": 0.33069849014282227, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5935378074645996, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3454819619655609, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3709.0, | |
| "completions/mean_length": 1475.26171875, | |
| "completions/min_length": 436.0, | |
| "epoch": 0.3877551020408163, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13991771662886215, | |
| "kl": 0.0006190207309373363, | |
| "learning_rate": 3.409762342408719e-07, | |
| "loss": 6.48200511932373e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.5287143588066101, | |
| "reward_std": 0.4257807731628418, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5287143588066101, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.43025365471839905, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1525.3359375, | |
| "completions/min_length": 594.0, | |
| "epoch": 0.391156462585034, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12956428174558993, | |
| "kl": 0.0006674773485428886, | |
| "learning_rate": 3.384570701354652e-07, | |
| "loss": 6.612390279769897e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.4876652956008911, | |
| "reward_std": 0.40975645184516907, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4876652956008911, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4222400188446045, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4185.0, | |
| "completions/mean_length": 1450.0, | |
| "completions/min_length": 421.0, | |
| "epoch": 0.3945578231292517, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13705414529406734, | |
| "kl": 0.0007245506872095575, | |
| "learning_rate": 3.359275964233066e-07, | |
| "loss": 7.245689630508423e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.507308840751648, | |
| "reward_std": 0.45902687311172485, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5073089003562927, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4505447745323181, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4866.0, | |
| "completions/mean_length": 1473.921875, | |
| "completions/min_length": 538.0, | |
| "epoch": 0.3979591836734694, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13099605307064782, | |
| "kl": 0.0007409593117699842, | |
| "learning_rate": 3.3338810791270517e-07, | |
| "loss": 6.780028343200684e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.3987925350666046, | |
| "reward_std": 0.4523935317993164, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.398792564868927, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4649738073348999, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3860.0, | |
| "completions/mean_length": 1428.5625, | |
| "completions/min_length": 446.0, | |
| "epoch": 0.4013605442176871, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12534450760531446, | |
| "kl": 0.0008648029947835312, | |
| "learning_rate": 3.308389005791871e-07, | |
| "loss": 9.611248970031738e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.604638934135437, | |
| "reward_std": 0.3078837990760803, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.604638934135437, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.33188706636428833, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1439.98046875, | |
| "completions/min_length": 430.0, | |
| "epoch": 0.40476190476190477, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12123023441006979, | |
| "kl": 0.0009042763977049617, | |
| "learning_rate": 3.282802715310006e-07, | |
| "loss": 9.08970832824707e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.46023768186569214, | |
| "reward_std": 0.3499431610107422, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.46023768186569214, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3669500946998596, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3627.0, | |
| "completions/mean_length": 1412.05078125, | |
| "completions/min_length": 601.0, | |
| "epoch": 0.40816326530612246, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12231666388502688, | |
| "kl": 0.0009289461663684051, | |
| "learning_rate": 3.2571251897448763e-07, | |
| "loss": 9.201467037200928e-07, | |
| "num_turns": 2.0, | |
| "reward": 0.44291263818740845, | |
| "reward_std": 0.4349104166030884, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.44291260838508606, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.44798025488853455, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4455.0, | |
| "completions/mean_length": 1373.9921875, | |
| "completions/min_length": 417.0, | |
| "epoch": 0.41156462585034015, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.129259993408259, | |
| "kl": 0.0010929724148809328, | |
| "learning_rate": 3.2313594217932854e-07, | |
| "loss": 1.1213123798370361e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.3926813304424286, | |
| "reward_std": 0.4589789807796478, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.3926813304424286, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4817292094230652, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1317.17578125, | |
| "completions/min_length": 514.0, | |
| "epoch": 0.41496598639455784, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13232964107425263, | |
| "kl": 0.001064850624970859, | |
| "learning_rate": 3.205508414436619e-07, | |
| "loss": 1.1045485734939575e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.47658634185791016, | |
| "reward_std": 0.46201151609420776, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.47658634185791016, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4703359305858612, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5202.0, | |
| "completions/mean_length": 1406.23828125, | |
| "completions/min_length": 566.0, | |
| "epoch": 0.41836734693877553, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1272398524800165, | |
| "kl": 0.0010337179801354068, | |
| "learning_rate": 3.179575180590857e-07, | |
| "loss": 1.0048970580101013e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5132541656494141, | |
| "reward_std": 0.39524388313293457, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5132542252540588, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.41758498549461365, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4192.0, | |
| "completions/mean_length": 1318.7265625, | |
| "completions/min_length": 481.0, | |
| "epoch": 0.4217687074829932, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1322752900393093, | |
| "kl": 0.0011480792186375766, | |
| "learning_rate": 3.153562742755414e-07, | |
| "loss": 1.0300427675247192e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5350508689880371, | |
| "reward_std": 0.37076908349990845, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5350508689880371, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.39439699053764343, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3802.0, | |
| "completions/mean_length": 1236.74609375, | |
| "completions/min_length": 491.0, | |
| "epoch": 0.42517006802721086, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1269238046945135, | |
| "kl": 0.001386871756039909, | |
| "learning_rate": 3.1274741326608717e-07, | |
| "loss": 1.4044344425201416e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5489836931228638, | |
| "reward_std": 0.3799617290496826, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5489837527275085, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4073076844215393, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3207.0, | |
| "completions/mean_length": 1224.5, | |
| "completions/min_length": 463.0, | |
| "epoch": 0.42857142857142855, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12528652678460261, | |
| "kl": 0.001363825639600691, | |
| "learning_rate": 3.101312390915634e-07, | |
| "loss": 1.3401731848716736e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5499537587165833, | |
| "reward_std": 0.41209450364112854, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5499536991119385, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42937520146369934, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3540.0, | |
| "completions/mean_length": 1237.4140625, | |
| "completions/min_length": 495.0, | |
| "epoch": 0.43197278911564624, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13733437857964267, | |
| "kl": 0.0014534044730680762, | |
| "learning_rate": 3.075080566651544e-07, | |
| "loss": 1.4118850231170654e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5841702818870544, | |
| "reward_std": 0.3928337097167969, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5841702818870544, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.40045365691185, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6243.0, | |
| "completions/mean_length": 1382.99609375, | |
| "completions/min_length": 428.0, | |
| "epoch": 0.43537414965986393, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12486667422213141, | |
| "kl": 0.0012770373573403049, | |
| "learning_rate": 3.048781717168513e-07, | |
| "loss": 1.261010766029358e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5601485967636108, | |
| "reward_std": 0.3312610387802124, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5601485967636108, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.35082247853279114, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3410.0, | |
| "completions/mean_length": 1243.3359375, | |
| "completions/min_length": 499.0, | |
| "epoch": 0.4387755102040816, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12080802782214185, | |
| "kl": 0.0014379564872797346, | |
| "learning_rate": 3.022418907578188e-07, | |
| "loss": 1.3951212167739868e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5687495470046997, | |
| "reward_std": 0.32862186431884766, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5687495470046997, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3444008231163025, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1209.078125, | |
| "completions/min_length": 424.0, | |
| "epoch": 0.4421768707482993, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13074055235654158, | |
| "kl": 0.0014855460885883076, | |
| "learning_rate": 2.9959952104467243e-07, | |
| "loss": 1.4938414096832275e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4377948045730591, | |
| "reward_std": 0.3570207357406616, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4377948045730591, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.37782377004623413, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4741.0, | |
| "completions/mean_length": 1240.36328125, | |
| "completions/min_length": 440.0, | |
| "epoch": 0.445578231292517, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1331763029150445, | |
| "kl": 0.001475883183957194, | |
| "learning_rate": 2.9695137054366753e-07, | |
| "loss": 1.4230608940124512e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5108271241188049, | |
| "reward_std": 0.36244717240333557, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5108271241188049, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.36959731578826904, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6489.0, | |
| "completions/mean_length": 1305.765625, | |
| "completions/min_length": 430.0, | |
| "epoch": 0.4489795918367347, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1290586781632192, | |
| "kl": 0.001309950057475362, | |
| "learning_rate": 2.942977478948057e-07, | |
| "loss": 1.296401023864746e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5338167548179626, | |
| "reward_std": 0.4010734260082245, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5338166952133179, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4047679305076599, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5633.0, | |
| "completions/mean_length": 1277.81640625, | |
| "completions/min_length": 414.0, | |
| "epoch": 0.4523809523809524, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13533104933339096, | |
| "kl": 0.0014049280389372143, | |
| "learning_rate": 2.916389623758636e-07, | |
| "loss": 1.3941898941993713e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4498026967048645, | |
| "reward_std": 0.4285493791103363, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4498027265071869, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4484274387359619, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4146.0, | |
| "completions/mean_length": 1157.87109375, | |
| "completions/min_length": 483.0, | |
| "epoch": 0.4557823129251701, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1429767214006691, | |
| "kl": 0.0016457732672279235, | |
| "learning_rate": 2.889753238663466e-07, | |
| "loss": 1.6689300537109375e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4900895953178406, | |
| "reward_std": 0.4475466310977936, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4900895953178406, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.44956454634666443, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3353.0, | |
| "completions/mean_length": 1228.65625, | |
| "completions/min_length": 536.0, | |
| "epoch": 0.45918367346938777, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12608630288605216, | |
| "kl": 0.001589239196619019, | |
| "learning_rate": 2.863071428113726e-07, | |
| "loss": 1.5730038285255432e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.42409783601760864, | |
| "reward_std": 0.4034125804901123, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.42409780621528625, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.41312265396118164, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3471.0, | |
| "completions/mean_length": 1234.89453125, | |
| "completions/min_length": 612.0, | |
| "epoch": 0.46258503401360546, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12497242950925527, | |
| "kl": 0.001500208744801057, | |
| "learning_rate": 2.836347301854897e-07, | |
| "loss": 1.475214958190918e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5871116518974304, | |
| "reward_std": 0.2970415949821472, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5871115922927856, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.32421791553497314, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2919.0, | |
| "completions/mean_length": 1207.34765625, | |
| "completions/min_length": 409.0, | |
| "epoch": 0.46598639455782315, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13123736127619876, | |
| "kl": 0.0015240276952681597, | |
| "learning_rate": 2.8095839745643255e-07, | |
| "loss": 1.5515834093093872e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5804826021194458, | |
| "reward_std": 0.3200109302997589, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5804826021194458, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3496645390987396, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4525.0, | |
| "completions/mean_length": 1346.73046875, | |
| "completions/min_length": 507.0, | |
| "epoch": 0.46938775510204084, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12325423385372133, | |
| "kl": 0.0013416068013611948, | |
| "learning_rate": 2.782784565488211e-07, | |
| "loss": 1.3336539268493652e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.3281669020652771, | |
| "reward_std": 0.4673292338848114, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.3281669020652771, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.47725093364715576, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4483.0, | |
| "completions/mean_length": 1220.921875, | |
| "completions/min_length": 450.0, | |
| "epoch": 0.47278911564625853, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12885536248220428, | |
| "kl": 0.00149958189103927, | |
| "learning_rate": 2.7559521980780566e-07, | |
| "loss": 1.4808028936386108e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.532761812210083, | |
| "reward_std": 0.34896931052207947, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.532761812210083, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.40563684701919556, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3037.0, | |
| "completions/mean_length": 1221.2890625, | |
| "completions/min_length": 488.0, | |
| "epoch": 0.47619047619047616, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13869574814859498, | |
| "kl": 0.0014890359529999841, | |
| "learning_rate": 2.729089999626637e-07, | |
| "loss": 1.4794059097766876e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4990660846233368, | |
| "reward_std": 0.39672043919563293, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4990660846233368, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.41187581419944763, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4350.0, | |
| "completions/mean_length": 1237.12890625, | |
| "completions/min_length": 502.0, | |
| "epoch": 0.47959183673469385, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12395471455250524, | |
| "kl": 0.0014677665994895506, | |
| "learning_rate": 2.7022011009035107e-07, | |
| "loss": 1.3653188943862915e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4990885257720947, | |
| "reward_std": 0.4302549362182617, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.49908849596977234, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.43232402205467224, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3966.0, | |
| "completions/mean_length": 1265.9140625, | |
| "completions/min_length": 425.0, | |
| "epoch": 0.48299319727891155, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13012857456910262, | |
| "kl": 0.0014557794766005827, | |
| "learning_rate": 2.675288635790135e-07, | |
| "loss": 1.385807991027832e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4568708539009094, | |
| "reward_std": 0.4049414396286011, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4568708539009094, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4264875650405884, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1365.4765625, | |
| "completions/min_length": 420.0, | |
| "epoch": 0.48639455782312924, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11805162085928556, | |
| "kl": 0.0013945812579549965, | |
| "learning_rate": 2.648355740914613e-07, | |
| "loss": 1.4118850231170654e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4730561375617981, | |
| "reward_std": 0.38758885860443115, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4730561375617981, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.40097248554229736, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3269.0, | |
| "completions/mean_length": 1158.1328125, | |
| "completions/min_length": 486.0, | |
| "epoch": 0.4897959183673469, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13022193041479954, | |
| "kl": 0.0015572439251627657, | |
| "learning_rate": 2.621405555286121e-07, | |
| "loss": 1.6046687960624695e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4237700402736664, | |
| "reward_std": 0.4052298665046692, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4237700402736664, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4302515387535095, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4477.0, | |
| "completions/mean_length": 1271.70703125, | |
| "completions/min_length": 438.0, | |
| "epoch": 0.4931972789115646, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13026005270843405, | |
| "kl": 0.0014919168093001645, | |
| "learning_rate": 2.594441219929058e-07, | |
| "loss": 1.5385448932647705e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4565078318119049, | |
| "reward_std": 0.43446728587150574, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4565078318119049, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4460132122039795, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3277.0, | |
| "completions/mean_length": 1155.765625, | |
| "completions/min_length": 569.0, | |
| "epoch": 0.4965986394557823, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12931491561349248, | |
| "kl": 0.0016008713791961782, | |
| "learning_rate": 2.5674658775169677e-07, | |
| "loss": 1.6149133443832397e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4877603054046631, | |
| "reward_std": 0.41081666946411133, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4877602756023407, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.43899431824684143, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3038.0, | |
| "completions/mean_length": 1189.8828125, | |
| "completions/min_length": 449.0, | |
| "epoch": 0.5, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12702661749227706, | |
| "kl": 0.0015393904413940618, | |
| "learning_rate": 2.540482672006254e-07, | |
| "loss": 1.5124678611755371e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.500877857208252, | |
| "reward_std": 0.3198213577270508, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.500877857208252, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3381810188293457, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3720.0, | |
| "completions/mean_length": 1171.10546875, | |
| "completions/min_length": 432.0, | |
| "epoch": 0.5034013605442177, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.134361927289558, | |
| "kl": 0.0016590512541370117, | |
| "learning_rate": 2.513494748269761e-07, | |
| "loss": 1.6801059246063232e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5662176012992859, | |
| "reward_std": 0.3807193636894226, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5662176012992859, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.38985222578048706, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3483.0, | |
| "completions/mean_length": 1202.5234375, | |
| "completions/min_length": 473.0, | |
| "epoch": 0.5068027210884354, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12963409344180143, | |
| "kl": 0.001650203145800333, | |
| "learning_rate": 2.4865052517302394e-07, | |
| "loss": 1.5692785382270813e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5353552103042603, | |
| "reward_std": 0.37545841932296753, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5353552103042603, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3899039626121521, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3251.0, | |
| "completions/mean_length": 1229.546875, | |
| "completions/min_length": 406.0, | |
| "epoch": 0.5102040816326531, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12807932315566303, | |
| "kl": 0.001541927182188374, | |
| "learning_rate": 2.459517327993746e-07, | |
| "loss": 1.5497207641601562e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4594876766204834, | |
| "reward_std": 0.3596667945384979, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4594876766204834, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.39366379380226135, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4203.0, | |
| "completions/mean_length": 1162.60546875, | |
| "completions/min_length": 493.0, | |
| "epoch": 0.5136054421768708, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13195509056801039, | |
| "kl": 0.0017888068650790956, | |
| "learning_rate": 2.4325341224830326e-07, | |
| "loss": 1.7918646335601807e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.7158861756324768, | |
| "reward_std": 0.2936908006668091, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.7158861756324768, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3239665925502777, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3430.0, | |
| "completions/mean_length": 1246.49609375, | |
| "completions/min_length": 502.0, | |
| "epoch": 0.5170068027210885, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12349750689603009, | |
| "kl": 0.0015071489342517452, | |
| "learning_rate": 2.405558780070942e-07, | |
| "loss": 1.4919787645339966e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.47492241859436035, | |
| "reward_std": 0.4099407494068146, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.47492241859436035, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4219662547111511, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3985.0, | |
| "completions/mean_length": 1217.10546875, | |
| "completions/min_length": 489.0, | |
| "epoch": 0.5204081632653061, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14744966056557027, | |
| "kl": 0.0015069277460497688, | |
| "learning_rate": 2.37859444471388e-07, | |
| "loss": 1.4007091522216797e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.39524000883102417, | |
| "reward_std": 0.4498276710510254, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.3952399790287018, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4426690936088562, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4924.0, | |
| "completions/mean_length": 1208.10546875, | |
| "completions/min_length": 449.0, | |
| "epoch": 0.5238095238095238, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1270108879721089, | |
| "kl": 0.0017198402274516411, | |
| "learning_rate": 2.3516442590853866e-07, | |
| "loss": 1.7210841178894043e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5516296625137329, | |
| "reward_std": 0.410399854183197, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5516296625137329, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.40337714552879333, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6468.0, | |
| "completions/mean_length": 1229.87890625, | |
| "completions/min_length": 399.0, | |
| "epoch": 0.5272108843537415, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13149493698674622, | |
| "kl": 0.0015122121412787237, | |
| "learning_rate": 2.3247113642098645e-07, | |
| "loss": 1.5497207641601562e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.476318895816803, | |
| "reward_std": 0.42769843339920044, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.476318895816803, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4319699704647064, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1269.3125, | |
| "completions/min_length": 429.0, | |
| "epoch": 0.5306122448979592, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1304574189322417, | |
| "kl": 0.0015285064409908955, | |
| "learning_rate": 2.2977988990964896e-07, | |
| "loss": 1.5534460544586182e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6005215048789978, | |
| "reward_std": 0.32493290305137634, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6005215048789978, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.34041628241539, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4384.0, | |
| "completions/mean_length": 1235.43359375, | |
| "completions/min_length": 593.0, | |
| "epoch": 0.5340136054421769, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12127411001199988, | |
| "kl": 0.0016412290724474587, | |
| "learning_rate": 2.2709100003733634e-07, | |
| "loss": 1.519918441772461e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5233211517333984, | |
| "reward_std": 0.3616924583911896, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5233211517333984, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3967532515525818, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5871.0, | |
| "completions/mean_length": 1226.09375, | |
| "completions/min_length": 506.0, | |
| "epoch": 0.5374149659863946, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12594349415909045, | |
| "kl": 0.0013712193344872503, | |
| "learning_rate": 2.2440478019219437e-07, | |
| "loss": 1.3541430234909058e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5536775588989258, | |
| "reward_std": 0.34202492237091064, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5536775588989258, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.35093897581100464, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2961.0, | |
| "completions/mean_length": 1209.2421875, | |
| "completions/min_length": 504.0, | |
| "epoch": 0.5408163265306123, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.127306231377746, | |
| "kl": 0.0014342863169076736, | |
| "learning_rate": 2.2172154345117894e-07, | |
| "loss": 1.430511474609375e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5524036884307861, | |
| "reward_std": 0.32542017102241516, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5524036884307861, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.333621621131897, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3949.0, | |
| "completions/mean_length": 1182.046875, | |
| "completions/min_length": 472.0, | |
| "epoch": 0.54421768707483, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.130686322143201, | |
| "kl": 0.0014794531780353282, | |
| "learning_rate": 2.1904160254356748e-07, | |
| "loss": 1.4044344425201416e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6170496344566345, | |
| "reward_std": 0.3277736008167267, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6170496344566345, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3628644049167633, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2928.0, | |
| "completions/mean_length": 1152.2265625, | |
| "completions/min_length": 511.0, | |
| "epoch": 0.5476190476190477, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.132823142268967, | |
| "kl": 0.0014511521703752805, | |
| "learning_rate": 2.1636526981451036e-07, | |
| "loss": 1.5888363122940063e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5477647185325623, | |
| "reward_std": 0.3859521746635437, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5477646589279175, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4129984378814697, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4990.0, | |
| "completions/mean_length": 1285.828125, | |
| "completions/min_length": 467.0, | |
| "epoch": 0.5510204081632653, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12785688650096694, | |
| "kl": 0.0013251102805043047, | |
| "learning_rate": 2.1369285718862748e-07, | |
| "loss": 1.259148120880127e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5367299914360046, | |
| "reward_std": 0.3701598346233368, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5367300510406494, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3895636796951294, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4772.0, | |
| "completions/mean_length": 1222.5625, | |
| "completions/min_length": 446.0, | |
| "epoch": 0.5544217687074829, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1330333320092087, | |
| "kl": 0.0012835902516599162, | |
| "learning_rate": 2.1102467613365334e-07, | |
| "loss": 1.3317912817001343e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5249505043029785, | |
| "reward_std": 0.383068710565567, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5249505043029785, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.39008262753486633, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2926.0, | |
| "completions/mean_length": 1287.45703125, | |
| "completions/min_length": 528.0, | |
| "epoch": 0.5578231292517006, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1407269238620054, | |
| "kl": 0.0013230719678176683, | |
| "learning_rate": 2.0836103762413638e-07, | |
| "loss": 1.344829797744751e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5449553728103638, | |
| "reward_std": 0.4117432236671448, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5449553728103638, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4277450442314148, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3309.0, | |
| "completions/mean_length": 1280.25390625, | |
| "completions/min_length": 509.0, | |
| "epoch": 0.5612244897959183, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12404282707169718, | |
| "kl": 0.0013554931147155003, | |
| "learning_rate": 2.0570225210519433e-07, | |
| "loss": 1.3336539268493652e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5558387041091919, | |
| "reward_std": 0.3796404004096985, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5558386445045471, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.41854989528656006, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3314.0, | |
| "completions/mean_length": 1228.93359375, | |
| "completions/min_length": 471.0, | |
| "epoch": 0.564625850340136, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12571021073988586, | |
| "kl": 0.0013914067976656952, | |
| "learning_rate": 2.0304862945633247e-07, | |
| "loss": 1.4193356037139893e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5028097033500671, | |
| "reward_std": 0.36212992668151855, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5028097033500671, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.36804434657096863, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3651.0, | |
| "completions/mean_length": 1311.5703125, | |
| "completions/min_length": 564.0, | |
| "epoch": 0.5680272108843537, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12460650819042984, | |
| "kl": 0.0012771672645612853, | |
| "learning_rate": 2.0040047895532752e-07, | |
| "loss": 1.2405216693878174e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.501116931438446, | |
| "reward_std": 0.3999606668949127, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.501116931438446, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4230176508426666, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1383.45703125, | |
| "completions/min_length": 521.0, | |
| "epoch": 0.5714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1184463202674069, | |
| "kl": 0.001180576521619514, | |
| "learning_rate": 1.977581092421812e-07, | |
| "loss": 1.173466444015503e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.39330971240997314, | |
| "reward_std": 0.4362216889858246, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.39330971240997314, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4408242404460907, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3373.0, | |
| "completions/mean_length": 1204.75390625, | |
| "completions/min_length": 447.0, | |
| "epoch": 0.5748299319727891, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12632851795944025, | |
| "kl": 0.0014828696912445594, | |
| "learning_rate": 1.9512182828314882e-07, | |
| "loss": 1.475214958190918e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5679004788398743, | |
| "reward_std": 0.33988669514656067, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5679004192352295, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.36474913358688354, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3822.0, | |
| "completions/mean_length": 1174.91015625, | |
| "completions/min_length": 435.0, | |
| "epoch": 0.5782312925170068, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1272662009578314, | |
| "kl": 0.001407989510880725, | |
| "learning_rate": 1.9249194333484563e-07, | |
| "loss": 1.3150274753570557e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6547111868858337, | |
| "reward_std": 0.25977566838264465, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6547111868858337, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.2951057255268097, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2913.0, | |
| "completions/mean_length": 1225.19140625, | |
| "completions/min_length": 564.0, | |
| "epoch": 0.5816326530612245, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12591804144081578, | |
| "kl": 0.0013545075703405018, | |
| "learning_rate": 1.8986876090843664e-07, | |
| "loss": 1.4118850231170654e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5509231686592102, | |
| "reward_std": 0.31451380252838135, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5509231090545654, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.34253573417663574, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5525.0, | |
| "completions/mean_length": 1350.0078125, | |
| "completions/min_length": 545.0, | |
| "epoch": 0.5850340136054422, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1184334593846969, | |
| "kl": 0.0012318995350142359, | |
| "learning_rate": 1.872525867339128e-07, | |
| "loss": 1.2330710887908936e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5996668338775635, | |
| "reward_std": 0.35077401995658875, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5996668338775635, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.36193662881851196, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4773.0, | |
| "completions/mean_length": 1289.87109375, | |
| "completions/min_length": 497.0, | |
| "epoch": 0.5884353741496599, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13773093289384147, | |
| "kl": 0.0013899512669013347, | |
| "learning_rate": 1.8464372572445863e-07, | |
| "loss": 1.434236764907837e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5270853638648987, | |
| "reward_std": 0.40748390555381775, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5270853638648987, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.41710126399993896, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3433.0, | |
| "completions/mean_length": 1167.64453125, | |
| "completions/min_length": 505.0, | |
| "epoch": 0.5918367346938775, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12112995046800766, | |
| "kl": 0.0013812177198815334, | |
| "learning_rate": 1.8204248194091425e-07, | |
| "loss": 1.344829797744751e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6126636862754822, | |
| "reward_std": 0.3266138434410095, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6126636862754822, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3397049903869629, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3811.0, | |
| "completions/mean_length": 1326.52734375, | |
| "completions/min_length": 448.0, | |
| "epoch": 0.5952380952380952, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.129796948317958, | |
| "kl": 0.001184793375159643, | |
| "learning_rate": 1.7944915855633807e-07, | |
| "loss": 1.3085082173347473e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.44264623522758484, | |
| "reward_std": 0.3939417004585266, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.44264620542526245, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42343512177467346, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3019.0, | |
| "completions/mean_length": 1266.4609375, | |
| "completions/min_length": 404.0, | |
| "epoch": 0.5986394557823129, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1207800852669321, | |
| "kl": 0.0013622808933178021, | |
| "learning_rate": 1.768640578206715e-07, | |
| "loss": 1.2665987014770508e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5672487616539001, | |
| "reward_std": 0.3637806475162506, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5672487616539001, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.39485296607017517, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3501.0, | |
| "completions/mean_length": 1274.89453125, | |
| "completions/min_length": 405.0, | |
| "epoch": 0.6020408163265306, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11969351864846854, | |
| "kl": 0.0013109368346704287, | |
| "learning_rate": 1.7428748102551234e-07, | |
| "loss": 1.3485550880432129e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5126824378967285, | |
| "reward_std": 0.360828161239624, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5126823782920837, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.37800562381744385, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3133.0, | |
| "completions/mean_length": 1312.5078125, | |
| "completions/min_length": 424.0, | |
| "epoch": 0.6054421768707483, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1199661599278971, | |
| "kl": 0.0012908925173178432, | |
| "learning_rate": 1.7171972846899941e-07, | |
| "loss": 1.259148120880127e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5275952816009521, | |
| "reward_std": 0.38074105978012085, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5275952816009521, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.39470240473747253, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5515.0, | |
| "completions/mean_length": 1352.41015625, | |
| "completions/min_length": 405.0, | |
| "epoch": 0.608843537414966, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12621918244168606, | |
| "kl": 0.0012085978110008, | |
| "learning_rate": 1.691610994208129e-07, | |
| "loss": 1.2516975402832031e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.46217256784439087, | |
| "reward_std": 0.416616827249527, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.46217256784439087, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.40671467781066895, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2739.0, | |
| "completions/mean_length": 1107.76171875, | |
| "completions/min_length": 435.0, | |
| "epoch": 0.6122448979591837, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13966438978348047, | |
| "kl": 0.001379939973958244, | |
| "learning_rate": 1.6661189208729489e-07, | |
| "loss": 1.5422701835632324e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6245121359825134, | |
| "reward_std": 0.3324832022190094, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6245121359825134, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3664103150367737, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2833.0, | |
| "completions/mean_length": 1239.33203125, | |
| "completions/min_length": 295.0, | |
| "epoch": 0.6156462585034014, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12426157616035632, | |
| "kl": 0.0013459335514198756, | |
| "learning_rate": 1.6407240357669332e-07, | |
| "loss": 1.4137476682662964e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5082270503044128, | |
| "reward_std": 0.3264538645744324, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5082271099090576, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3465285301208496, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3867.0, | |
| "completions/mean_length": 1174.68359375, | |
| "completions/min_length": 446.0, | |
| "epoch": 0.6190476190476191, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13358274021087882, | |
| "kl": 0.00134277267534344, | |
| "learning_rate": 1.6154292986453483e-07, | |
| "loss": 1.3802200555801392e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5906236171722412, | |
| "reward_std": 0.34040746092796326, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5906236171722412, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3793966770172119, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1359.40234375, | |
| "completions/min_length": 454.0, | |
| "epoch": 0.6224489795918368, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12503538969545513, | |
| "kl": 0.0013009100903218496, | |
| "learning_rate": 1.5902376575912814e-07, | |
| "loss": 1.3336539268493652e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5492093563079834, | |
| "reward_std": 0.3843424916267395, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5492092967033386, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.41074615716934204, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2676.0, | |
| "completions/mean_length": 1193.87109375, | |
| "completions/min_length": 499.0, | |
| "epoch": 0.6258503401360545, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1208639891379796, | |
| "kl": 0.0013548607312259264, | |
| "learning_rate": 1.5651520486720516e-07, | |
| "loss": 1.3280659914016724e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5216140151023865, | |
| "reward_std": 0.32233840227127075, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5216140151023865, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3560025990009308, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3305.0, | |
| "completions/mean_length": 1245.7109375, | |
| "completions/min_length": 494.0, | |
| "epoch": 0.6292517006802721, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12686236905432466, | |
| "kl": 0.0012695216200881987, | |
| "learning_rate": 1.5401753955970097e-07, | |
| "loss": 1.30385160446167e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5973302721977234, | |
| "reward_std": 0.3663652539253235, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5973303318023682, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3876189589500427, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3152.0, | |
| "completions/mean_length": 1213.82421875, | |
| "completions/min_length": 502.0, | |
| "epoch": 0.6326530612244898, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12885973798806175, | |
| "kl": 0.001346839189864113, | |
| "learning_rate": 1.5153106093767825e-07, | |
| "loss": 1.3206154108047485e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6149528622627258, | |
| "reward_std": 0.3335758447647095, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6149528622627258, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3618091940879822, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4436.0, | |
| "completions/mean_length": 1248.99609375, | |
| "completions/min_length": 466.0, | |
| "epoch": 0.6360544217687075, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12657431869640567, | |
| "kl": 0.0012566588084155228, | |
| "learning_rate": 1.490560587983996e-07, | |
| "loss": 1.126900315284729e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6170510649681091, | |
| "reward_std": 0.28795865178108215, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6170510649681091, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3206290006637573, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3711.0, | |
| "completions/mean_length": 1272.578125, | |
| "completions/min_length": 491.0, | |
| "epoch": 0.6394557823129252, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12319663586080123, | |
| "kl": 0.0012713064520539774, | |
| "learning_rate": 1.465928216015522e-07, | |
| "loss": 1.1995434761047363e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5160113573074341, | |
| "reward_std": 0.38475197553634644, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5160113573074341, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.41495686769485474, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3447.0, | |
| "completions/mean_length": 1222.0703125, | |
| "completions/min_length": 443.0, | |
| "epoch": 0.6428571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12426694619632161, | |
| "kl": 0.001386047030791815, | |
| "learning_rate": 1.4414163643562753e-07, | |
| "loss": 1.4081597328186035e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5270590782165527, | |
| "reward_std": 0.3698263168334961, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5270590782165527, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3735743463039398, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3303.0, | |
| "completions/mean_length": 1276.09375, | |
| "completions/min_length": 372.0, | |
| "epoch": 0.6462585034013606, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13033658908632512, | |
| "kl": 0.0013309565947565716, | |
| "learning_rate": 1.4170278898446175e-07, | |
| "loss": 1.3373792171478271e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5525405406951904, | |
| "reward_std": 0.4099390506744385, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5525405406951904, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42479801177978516, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3087.0, | |
| "completions/mean_length": 1215.47265625, | |
| "completions/min_length": 481.0, | |
| "epoch": 0.6496598639455783, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13086596111770168, | |
| "kl": 0.0013185567531763809, | |
| "learning_rate": 1.3927656349393952e-07, | |
| "loss": 1.3075768947601318e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5212171673774719, | |
| "reward_std": 0.4619579315185547, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5212171077728271, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.45611312985420227, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3030.0, | |
| "completions/mean_length": 1284.39453125, | |
| "completions/min_length": 457.0, | |
| "epoch": 0.6530612244897959, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12121365836513875, | |
| "kl": 0.0012196112975288997, | |
| "learning_rate": 1.3686324273886528e-07, | |
| "loss": 1.2903474271297455e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5198047161102295, | |
| "reward_std": 0.408640056848526, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5198047161102295, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4179668426513672, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4217.0, | |
| "completions/mean_length": 1236.8125, | |
| "completions/min_length": 503.0, | |
| "epoch": 0.6564625850340136, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11991459011860725, | |
| "kl": 0.0013542722726924694, | |
| "learning_rate": 1.3446310799000575e-07, | |
| "loss": 1.2852251529693604e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.672049880027771, | |
| "reward_std": 0.33700209856033325, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6720498204231262, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.37094196677207947, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3159.0, | |
| "completions/mean_length": 1212.5703125, | |
| "completions/min_length": 509.0, | |
| "epoch": 0.6598639455782312, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13003630784097575, | |
| "kl": 0.0013068479593130178, | |
| "learning_rate": 1.3207643898130853e-07, | |
| "loss": 1.2461096048355103e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5808924436569214, | |
| "reward_std": 0.3667425215244293, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5808924436569214, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4038356840610504, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3690.0, | |
| "completions/mean_length": 1275.81640625, | |
| "completions/min_length": 414.0, | |
| "epoch": 0.6632653061224489, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13731411089997758, | |
| "kl": 0.001287659304580302, | |
| "learning_rate": 1.2970351387729872e-07, | |
| "loss": 1.2349337339401245e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5368715524673462, | |
| "reward_std": 0.3973066806793213, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5368715524673462, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4072633683681488, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3239.0, | |
| "completions/mean_length": 1227.8125, | |
| "completions/min_length": 412.0, | |
| "epoch": 0.6666666666666666, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13545176265600176, | |
| "kl": 0.0013441352293739328, | |
| "learning_rate": 1.273446092406599e-07, | |
| "loss": 1.2777745723724365e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5994586944580078, | |
| "reward_std": 0.36241117119789124, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5994586944580078, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.36684757471084595, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2883.0, | |
| "completions/mean_length": 1199.0234375, | |
| "completions/min_length": 453.0, | |
| "epoch": 0.6700680272108843, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13288052160082212, | |
| "kl": 0.001319561021773552, | |
| "learning_rate": 1.2500000000000005e-07, | |
| "loss": 1.4156103134155273e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5040369033813477, | |
| "reward_std": 0.40427684783935547, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5040369033813477, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4201582968235016, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2679.0, | |
| "completions/mean_length": 1189.41796875, | |
| "completions/min_length": 480.0, | |
| "epoch": 0.673469387755102, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12608554405627326, | |
| "kl": 0.0013950904140074272, | |
| "learning_rate": 1.2266995941780933e-07, | |
| "loss": 1.4156103134155273e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5608351230621338, | |
| "reward_std": 0.2745205760002136, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5608351230621338, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.2968989610671997, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3910.0, | |
| "completions/mean_length": 1240.84765625, | |
| "completions/min_length": 452.0, | |
| "epoch": 0.6768707482993197, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12443194247315312, | |
| "kl": 0.0012590040068971575, | |
| "learning_rate": 1.2035475905861134e-07, | |
| "loss": 1.3299286365509033e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5148465633392334, | |
| "reward_std": 0.35695213079452515, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5148465633392334, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.38320231437683105, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4820.0, | |
| "completions/mean_length": 1296.47265625, | |
| "completions/min_length": 470.0, | |
| "epoch": 0.6802721088435374, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12668886855433553, | |
| "kl": 0.0012762437327182852, | |
| "learning_rate": 1.1805466875731276e-07, | |
| "loss": 1.3140961527824402e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.43766123056411743, | |
| "reward_std": 0.41216611862182617, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.43766123056411743, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.41272807121276855, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3375.0, | |
| "completions/mean_length": 1267.12890625, | |
| "completions/min_length": 451.0, | |
| "epoch": 0.6836734693877551, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12284357535625699, | |
| "kl": 0.0013010797565584653, | |
| "learning_rate": 1.1576995658775404e-07, | |
| "loss": 1.2200325727462769e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6425645351409912, | |
| "reward_std": 0.39451929926872253, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6425645351409912, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3956354558467865, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5039.0, | |
| "completions/mean_length": 1187.60546875, | |
| "completions/min_length": 477.0, | |
| "epoch": 0.6870748299319728, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1329605029871459, | |
| "kl": 0.0013695297720914823, | |
| "learning_rate": 1.1350088883146547e-07, | |
| "loss": 1.475214958190918e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5215319991111755, | |
| "reward_std": 0.3130980134010315, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5215319395065308, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.34181302785873413, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3808.0, | |
| "completions/mean_length": 1268.234375, | |
| "completions/min_length": 530.0, | |
| "epoch": 0.6904761904761905, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12558617089585156, | |
| "kl": 0.001284107916035282, | |
| "learning_rate": 1.1124772994663256e-07, | |
| "loss": 1.2665987014770508e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.48843446373939514, | |
| "reward_std": 0.3897768259048462, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.48843443393707275, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3982493281364441, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1252.0703125, | |
| "completions/min_length": 444.0, | |
| "epoch": 0.6938775510204082, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13137983301363407, | |
| "kl": 0.0013014281812502304, | |
| "learning_rate": 1.0901074253727336e-07, | |
| "loss": 1.2293457984924316e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.511155366897583, | |
| "reward_std": 0.340930312871933, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.511155366897583, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3543807566165924, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6128.0, | |
| "completions/mean_length": 1238.28515625, | |
| "completions/min_length": 479.0, | |
| "epoch": 0.6972789115646258, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13356187086446547, | |
| "kl": 0.0013667331368196756, | |
| "learning_rate": 1.0679018732263257e-07, | |
| "loss": 1.3671815395355225e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5958702564239502, | |
| "reward_std": 0.39605003595352173, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5958702564239502, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4235108494758606, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3230.0, | |
| "completions/mean_length": 1246.95703125, | |
| "completions/min_length": 480.0, | |
| "epoch": 0.7006802721088435, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12808594455299088, | |
| "kl": 0.0013271556008476182, | |
| "learning_rate": 1.0458632310679438e-07, | |
| "loss": 1.4379620552062988e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6012597680091858, | |
| "reward_std": 0.3364196717739105, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6012598276138306, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3379861116409302, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3196.0, | |
| "completions/mean_length": 1140.3046875, | |
| "completions/min_length": 439.0, | |
| "epoch": 0.7040816326530612, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12792788035068012, | |
| "kl": 0.0013527601777241216, | |
| "learning_rate": 1.0239940674851941e-07, | |
| "loss": 1.3522803783416748e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5495452880859375, | |
| "reward_std": 0.3822019696235657, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5495453476905823, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3915736973285675, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1340.83984375, | |
| "completions/min_length": 539.0, | |
| "epoch": 0.7074829931972789, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12050380420709206, | |
| "kl": 0.0012090708578398335, | |
| "learning_rate": 1.0022969313130773e-07, | |
| "loss": 1.1548399925231934e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.47699809074401855, | |
| "reward_std": 0.33950385451316833, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.47699812054634094, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3521897494792938, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5988.0, | |
| "completions/mean_length": 1326.73828125, | |
| "completions/min_length": 482.0, | |
| "epoch": 0.7108843537414966, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12634413168781206, | |
| "kl": 0.0012325601328484481, | |
| "learning_rate": 9.80774351336927e-08, | |
| "loss": 1.2740492820739746e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.448714017868042, | |
| "reward_std": 0.43655937910079956, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.448714017868042, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42768001556396484, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2801.0, | |
| "completions/mean_length": 1206.16796875, | |
| "completions/min_length": 463.0, | |
| "epoch": 0.7142857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1306756891257428, | |
| "kl": 0.0013014997630307334, | |
| "learning_rate": 9.594288359976815e-08, | |
| "loss": 1.3671815395355225e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5778207778930664, | |
| "reward_std": 0.410552442073822, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5778207182884216, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42478153109550476, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4431.0, | |
| "completions/mean_length": 1307.96875, | |
| "completions/min_length": 466.0, | |
| "epoch": 0.717687074829932, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12608798208719338, | |
| "kl": 0.0012636054516406148, | |
| "learning_rate": 9.38262873099522e-08, | |
| "loss": 1.3262033462524414e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.41280120611190796, | |
| "reward_std": 0.4218251705169678, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.41280120611190796, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.43390703201293945, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3796.0, | |
| "completions/mean_length": 1275.0859375, | |
| "completions/min_length": 506.0, | |
| "epoch": 0.7210884353741497, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12194629809618202, | |
| "kl": 0.0012277502910365001, | |
| "learning_rate": 9.172789295199254e-08, | |
| "loss": 1.173466444015503e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5452022552490234, | |
| "reward_std": 0.42787063121795654, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5452022552490234, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4353907108306885, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3460.0, | |
| "completions/mean_length": 1236.53125, | |
| "completions/min_length": 446.0, | |
| "epoch": 0.7244897959183674, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.14036928021373707, | |
| "kl": 0.0013081711795166484, | |
| "learning_rate": 8.964794509221507e-08, | |
| "loss": 1.3136304914951324e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6131947040557861, | |
| "reward_std": 0.39594751596450806, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6131947040557861, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4020529091358185, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5378.0, | |
| "completions/mean_length": 1286.6015625, | |
| "completions/min_length": 437.0, | |
| "epoch": 0.7278911564625851, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12787518883894636, | |
| "kl": 0.001297479861023021, | |
| "learning_rate": 8.758668614701972e-08, | |
| "loss": 1.2405216693878174e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5273051857948303, | |
| "reward_std": 0.42367199063301086, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5273051261901855, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4331037104129791, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3119.0, | |
| "completions/mean_length": 1257.484375, | |
| "completions/min_length": 455.0, | |
| "epoch": 0.7312925170068028, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.11980010487610358, | |
| "kl": 0.0013178105773477, | |
| "learning_rate": 8.55443563546274e-08, | |
| "loss": 1.3113021850585938e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4529898166656494, | |
| "reward_std": 0.41719964146614075, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4529898166656494, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.46845582127571106, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3151.0, | |
| "completions/mean_length": 1218.0078125, | |
| "completions/min_length": 430.0, | |
| "epoch": 0.7346938775510204, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1331665548979272, | |
| "kl": 0.0013294228419908904, | |
| "learning_rate": 8.352119374707977e-08, | |
| "loss": 1.2814998626708984e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6287795305252075, | |
| "reward_std": 0.2791963517665863, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6287795305252075, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3135279417037964, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3813.0, | |
| "completions/mean_length": 1320.07421875, | |
| "completions/min_length": 381.0, | |
| "epoch": 0.7380952380952381, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13325778517888945, | |
| "kl": 0.001306778119669616, | |
| "learning_rate": 8.151743412249728e-08, | |
| "loss": 1.3438984751701355e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5232036709785461, | |
| "reward_std": 0.3668539822101593, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5232036113739014, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4027640223503113, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4038.0, | |
| "completions/mean_length": 1262.80859375, | |
| "completions/min_length": 521.0, | |
| "epoch": 0.7414965986394558, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1226930510944803, | |
| "kl": 0.0013263678656585398, | |
| "learning_rate": 7.953331101759705e-08, | |
| "loss": 1.389533281326294e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6287027597427368, | |
| "reward_std": 0.34670159220695496, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6287027597427368, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.36597710847854614, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4104.0, | |
| "completions/mean_length": 1232.12109375, | |
| "completions/min_length": 429.0, | |
| "epoch": 0.7448979591836735, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12901763117209306, | |
| "kl": 0.0013840211349815945, | |
| "learning_rate": 7.756905568047392e-08, | |
| "loss": 1.3485550880432129e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5536688566207886, | |
| "reward_std": 0.3205341696739197, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5536688566207886, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3282807171344757, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2886.0, | |
| "completions/mean_length": 1263.96875, | |
| "completions/min_length": 482.0, | |
| "epoch": 0.7482993197278912, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12890254868272635, | |
| "kl": 0.0014110473175605875, | |
| "learning_rate": 7.56248970436493e-08, | |
| "loss": 1.4100223779678345e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6479886770248413, | |
| "reward_std": 0.35511314868927, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6479886770248413, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3817684054374695, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5501.0, | |
| "completions/mean_length": 1255.84765625, | |
| "completions/min_length": 455.0, | |
| "epoch": 0.7517006802721088, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12610719119192687, | |
| "kl": 0.0014363232407959003, | |
| "learning_rate": 7.37010616973886e-08, | |
| "loss": 1.2908130884170532e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5539280772209167, | |
| "reward_std": 0.4543180465698242, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.553928017616272, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4694295823574066, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3728.0, | |
| "completions/mean_length": 1300.24609375, | |
| "completions/min_length": 513.0, | |
| "epoch": 0.7551020408163265, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12843810609713965, | |
| "kl": 0.0012828246744902572, | |
| "learning_rate": 7.179777386329275e-08, | |
| "loss": 1.280568540096283e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5100784301757812, | |
| "reward_std": 0.40308377146720886, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5100784301757812, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4186839163303375, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3139.0, | |
| "completions/mean_length": 1211.234375, | |
| "completions/min_length": 432.0, | |
| "epoch": 0.7585034013605442, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13185135148687682, | |
| "kl": 0.001326795773820777, | |
| "learning_rate": 6.991525536816497e-08, | |
| "loss": 1.34296715259552e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.46107718348503113, | |
| "reward_std": 0.3825857639312744, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.46107718348503113, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3778640627861023, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3264.0, | |
| "completions/mean_length": 1196.265625, | |
| "completions/min_length": 503.0, | |
| "epoch": 0.7619047619047619, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13298086952113006, | |
| "kl": 0.0014002320585859707, | |
| "learning_rate": 6.805372561815767e-08, | |
| "loss": 1.369975507259369e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5463467240333557, | |
| "reward_std": 0.4121183753013611, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5463467240333557, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42288950085639954, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1262.1328125, | |
| "completions/min_length": 456.0, | |
| "epoch": 0.7653061224489796, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1255035449090627, | |
| "kl": 0.0013547614798881114, | |
| "learning_rate": 6.621340157319996e-08, | |
| "loss": 1.2889504432678223e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5293986797332764, | |
| "reward_std": 0.3732859492301941, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5293987393379211, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.40485361218452454, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3308.0, | |
| "completions/mean_length": 1212.421875, | |
| "completions/min_length": 462.0, | |
| "epoch": 0.7687074829931972, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12740333260958453, | |
| "kl": 0.001389774693052459, | |
| "learning_rate": 6.439449772171162e-08, | |
| "loss": 1.3522803783416748e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5247737169265747, | |
| "reward_std": 0.31403160095214844, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5247736573219299, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3341880440711975, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4311.0, | |
| "completions/mean_length": 1256.6953125, | |
| "completions/min_length": 519.0, | |
| "epoch": 0.7721088435374149, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1298841142351844, | |
| "kl": 0.0013443967050079664, | |
| "learning_rate": 6.259722605560488e-08, | |
| "loss": 1.4165416359901428e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.515049934387207, | |
| "reward_std": 0.43085336685180664, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.515049934387207, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4393814206123352, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4830.0, | |
| "completions/mean_length": 1255.2890625, | |
| "completions/min_length": 448.0, | |
| "epoch": 0.7755102040816326, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12858098027696746, | |
| "kl": 0.0012866992628914886, | |
| "learning_rate": 6.082179604557616e-08, | |
| "loss": 1.3224780559539795e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5552853345870972, | |
| "reward_std": 0.34694957733154297, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5552853345870972, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.36402466893196106, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3143.0, | |
| "completions/mean_length": 1254.99609375, | |
| "completions/min_length": 450.0, | |
| "epoch": 0.7789115646258503, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12893806348706394, | |
| "kl": 0.0012813517714675982, | |
| "learning_rate": 5.9068414616693266e-08, | |
| "loss": 1.2330710887908936e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.46818554401397705, | |
| "reward_std": 0.3551763892173767, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.46818554401397705, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.36898866295814514, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3138.0, | |
| "completions/mean_length": 1278.76171875, | |
| "completions/min_length": 507.0, | |
| "epoch": 0.782312925170068, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12421445570047403, | |
| "kl": 0.0013655927568834159, | |
| "learning_rate": 5.733728612427771e-08, | |
| "loss": 1.5338882803916931e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6461366415023804, | |
| "reward_std": 0.33448177576065063, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6461366415023804, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3466874659061432, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1294.75, | |
| "completions/min_length": 484.0, | |
| "epoch": 0.7857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12127892715668272, | |
| "kl": 0.0013997323094372405, | |
| "learning_rate": 5.5628612330087724e-08, | |
| "loss": 1.4491379261016846e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5366698503494263, | |
| "reward_std": 0.410519540309906, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5366698503494263, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42046892642974854, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3791.0, | |
| "completions/mean_length": 1275.3515625, | |
| "completions/min_length": 488.0, | |
| "epoch": 0.7891156462585034, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12653282121251824, | |
| "kl": 0.001317089796430082, | |
| "learning_rate": 5.394259237880272e-08, | |
| "loss": 1.3327226042747498e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5845522880554199, | |
| "reward_std": 0.31545960903167725, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5845522284507751, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3370753824710846, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5526.0, | |
| "completions/mean_length": 1302.609375, | |
| "completions/min_length": 542.0, | |
| "epoch": 0.7925170068027211, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13048198986020212, | |
| "kl": 0.0012734234924209886, | |
| "learning_rate": 5.227942277481362e-08, | |
| "loss": 1.296401023864746e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.41471242904663086, | |
| "reward_std": 0.4190084636211395, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.41471242904663086, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42466604709625244, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3382.0, | |
| "completions/mean_length": 1286.8125, | |
| "completions/min_length": 548.0, | |
| "epoch": 0.7959183673469388, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1275063488377491, | |
| "kl": 0.0013080939488645527, | |
| "learning_rate": 5.0639297359319846e-08, | |
| "loss": 1.3969838619232178e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5419309139251709, | |
| "reward_std": 0.4084588289260864, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5419309139251709, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42553380131721497, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3599.0, | |
| "completions/mean_length": 1292.29296875, | |
| "completions/min_length": 517.0, | |
| "epoch": 0.7993197278911565, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12339984082764446, | |
| "kl": 0.0012895453537566937, | |
| "learning_rate": 4.902240728773749e-08, | |
| "loss": 1.2051314115524292e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5682984590530396, | |
| "reward_std": 0.2678278088569641, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5682984590530396, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.29323381185531616, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3072.0, | |
| "completions/mean_length": 1364.32421875, | |
| "completions/min_length": 507.0, | |
| "epoch": 0.8027210884353742, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12114741624083838, | |
| "kl": 0.00119225540493062, | |
| "learning_rate": 4.742894100742062e-08, | |
| "loss": 1.1418014764785767e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.41127753257751465, | |
| "reward_std": 0.40841516852378845, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.41127756237983704, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4287666976451874, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3509.0, | |
| "completions/mean_length": 1307.91015625, | |
| "completions/min_length": 449.0, | |
| "epoch": 0.8061224489795918, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1332653805549814, | |
| "kl": 0.0013080557828288875, | |
| "learning_rate": 4.5859084235697235e-08, | |
| "loss": 1.3187527656555176e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.48149698972702026, | |
| "reward_std": 0.43754392862319946, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.48149701952934265, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.44631412625312805, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5070.0, | |
| "completions/mean_length": 1316.9921875, | |
| "completions/min_length": 465.0, | |
| "epoch": 0.8095238095238095, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12326346970337854, | |
| "kl": 0.0012852595964432112, | |
| "learning_rate": 4.43130199382247e-08, | |
| "loss": 1.4621764421463013e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.550597071647644, | |
| "reward_std": 0.4328611493110657, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.550597071647644, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4346959590911865, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6591.0, | |
| "completions/mean_length": 1328.93359375, | |
| "completions/min_length": 456.0, | |
| "epoch": 0.8129251700680272, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12092938068649253, | |
| "kl": 0.0012998831398363109, | |
| "learning_rate": 4.2790928307664706e-08, | |
| "loss": 1.2703239917755127e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4453105926513672, | |
| "reward_std": 0.48312580585479736, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4453105926513672, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4780498147010803, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3332.0, | |
| "completions/mean_length": 1213.22265625, | |
| "completions/min_length": 509.0, | |
| "epoch": 0.8163265306122449, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1316405251458809, | |
| "kl": 0.0013398944165601279, | |
| "learning_rate": 4.1292986742682254e-08, | |
| "loss": 1.3709068298339844e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6118814945220947, | |
| "reward_std": 0.30538809299468994, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6118814945220947, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.34404152631759644, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2994.0, | |
| "completions/mean_length": 1192.1640625, | |
| "completions/min_length": 445.0, | |
| "epoch": 0.8197278911564626, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12208372574845405, | |
| "kl": 0.0014582149669877253, | |
| "learning_rate": 3.98193698272698e-08, | |
| "loss": 1.3541430234909058e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5893775820732117, | |
| "reward_std": 0.36501747369766235, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5893775820732117, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3986145853996277, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4604.0, | |
| "completions/mean_length": 1244.73828125, | |
| "completions/min_length": 346.0, | |
| "epoch": 0.8231292517006803, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1272060823706791, | |
| "kl": 0.0013787021653115517, | |
| "learning_rate": 3.837024931039995e-08, | |
| "loss": 1.4100223779678345e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5549571514129639, | |
| "reward_std": 0.40352365374565125, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5549571514129639, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.407723993062973, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4099.0, | |
| "completions/mean_length": 1194.7421875, | |
| "completions/min_length": 467.0, | |
| "epoch": 0.826530612244898, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13157085650901407, | |
| "kl": 0.0014077305258979322, | |
| "learning_rate": 3.6945794086007705e-08, | |
| "loss": 1.2433156371116638e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6976893544197083, | |
| "reward_std": 0.2876189947128296, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.697689414024353, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.33244314789772034, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4483.0, | |
| "completions/mean_length": 1337.625, | |
| "completions/min_length": 521.0, | |
| "epoch": 0.8299319727891157, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1207531264113648, | |
| "kl": 0.0013017231822232134, | |
| "learning_rate": 3.5546170173306436e-08, | |
| "loss": 1.3485550880432129e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5120729207992554, | |
| "reward_std": 0.3624032437801361, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5120729207992554, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3639506995677948, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3035.0, | |
| "completions/mean_length": 1198.40625, | |
| "completions/min_length": 474.0, | |
| "epoch": 0.8333333333333334, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12806495998767994, | |
| "kl": 0.0014157043924569734, | |
| "learning_rate": 3.4171540697438355e-08, | |
| "loss": 1.3671815395355225e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5975139737129211, | |
| "reward_std": 0.37816518545150757, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5975139737129211, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.39434632658958435, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1249.2265625, | |
| "completions/min_length": 507.0, | |
| "epoch": 0.8367346938775511, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12522900942275697, | |
| "kl": 0.0013556128133132006, | |
| "learning_rate": 3.2822065870462215e-08, | |
| "loss": 1.4379620552062988e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4996965825557709, | |
| "reward_std": 0.4283973276615143, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.49969661235809326, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.44312095642089844, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3184.0, | |
| "completions/mean_length": 1268.6171875, | |
| "completions/min_length": 536.0, | |
| "epoch": 0.8401360544217688, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12919194681083224, | |
| "kl": 0.0012587565233843634, | |
| "learning_rate": 3.149790297268107e-08, | |
| "loss": 1.3522803783416748e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5647189021110535, | |
| "reward_std": 0.40422070026397705, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5647189021110535, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42468276619911194, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2953.0, | |
| "completions/mean_length": 1125.83203125, | |
| "completions/min_length": 508.0, | |
| "epoch": 0.8435374149659864, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12836800954696548, | |
| "kl": 0.0014017259263710002, | |
| "learning_rate": 3.0199206334310945e-08, | |
| "loss": 1.298263669013977e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6107585430145264, | |
| "reward_std": 0.3228145241737366, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6107584834098816, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3467032015323639, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4479.0, | |
| "completions/mean_length": 1233.171875, | |
| "completions/min_length": 384.0, | |
| "epoch": 0.8469387755102041, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12577691381845763, | |
| "kl": 0.0013851264275217545, | |
| "learning_rate": 2.892612731749414e-08, | |
| "loss": 1.2442469596862793e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5270353555679321, | |
| "reward_std": 0.40185391902923584, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5270353555679321, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4054703414440155, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2801.0, | |
| "completions/mean_length": 1164.140625, | |
| "completions/min_length": 448.0, | |
| "epoch": 0.8503401360544217, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13561042447797822, | |
| "kl": 0.0014222385561879491, | |
| "learning_rate": 2.7678814298657732e-08, | |
| "loss": 1.475214958190918e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5727814435958862, | |
| "reward_std": 0.2735365629196167, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5727814435958862, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3202778697013855, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4152.0, | |
| "completions/mean_length": 1262.21875, | |
| "completions/min_length": 469.0, | |
| "epoch": 0.8537414965986394, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1271199876604987, | |
| "kl": 0.0013093487041260232, | |
| "learning_rate": 2.6457412651220895e-08, | |
| "loss": 1.4081597328186035e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5430048704147339, | |
| "reward_std": 0.38867905735969543, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5430048704147339, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.41183096170425415, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4171.0, | |
| "completions/mean_length": 1299.67578125, | |
| "completions/min_length": 449.0, | |
| "epoch": 0.8571428571428571, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1304441357703922, | |
| "kl": 0.0013141680137778167, | |
| "learning_rate": 2.5262064728651194e-08, | |
| "loss": 1.3150274753570557e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5142631530761719, | |
| "reward_std": 0.3517936170101166, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5142631530761719, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.36099955439567566, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3958.0, | |
| "completions/mean_length": 1306.7109375, | |
| "completions/min_length": 468.0, | |
| "epoch": 0.8605442176870748, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1316901048192095, | |
| "kl": 0.0013079443942842772, | |
| "learning_rate": 2.409290984787371e-08, | |
| "loss": 1.3560056686401367e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5310392379760742, | |
| "reward_std": 0.3759360909461975, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5310392379760742, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3867994248867035, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2512.0, | |
| "completions/mean_length": 1135.703125, | |
| "completions/min_length": 434.0, | |
| "epoch": 0.8639455782312925, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1231556725021594, | |
| "kl": 0.001387271031489945, | |
| "learning_rate": 2.2950084273033633e-08, | |
| "loss": 1.4193356037139893e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6066977977752686, | |
| "reward_std": 0.2876471281051636, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6066977977752686, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.31261515617370605, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3405.0, | |
| "completions/mean_length": 1179.4453125, | |
| "completions/min_length": 461.0, | |
| "epoch": 0.8673469387755102, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13542018475020984, | |
| "kl": 0.0013846461633875151, | |
| "learning_rate": 2.183372119961499e-08, | |
| "loss": 1.3578683137893677e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4826492369174957, | |
| "reward_std": 0.37893763184547424, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4826492369174957, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3863247334957123, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3682.0, | |
| "completions/mean_length": 1303.90625, | |
| "completions/min_length": 515.0, | |
| "epoch": 0.8707482993197279, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12642203725075352, | |
| "kl": 0.0013932072433817666, | |
| "learning_rate": 2.074395073891644e-08, | |
| "loss": 1.3336539268493652e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5473675727844238, | |
| "reward_std": 0.3467334508895874, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5473675727844238, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3749827444553375, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3564.0, | |
| "completions/mean_length": 1240.109375, | |
| "completions/min_length": 477.0, | |
| "epoch": 0.8741496598639455, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1325814674676704, | |
| "kl": 0.0013994931805427768, | |
| "learning_rate": 1.9680899902887266e-08, | |
| "loss": 1.4491379261016846e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6287966966629028, | |
| "reward_std": 0.2795559763908386, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6287966966629028, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3080456852912903, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4612.0, | |
| "completions/mean_length": 1310.80078125, | |
| "completions/min_length": 479.0, | |
| "epoch": 0.8775510204081632, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1273096835305895, | |
| "kl": 0.0012589961306730402, | |
| "learning_rate": 1.8644692589323967e-08, | |
| "loss": 1.2870877981185913e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4979984760284424, | |
| "reward_std": 0.4460023045539856, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4979984760284424, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4358255863189697, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3290.0, | |
| "completions/mean_length": 1306.72265625, | |
| "completions/min_length": 463.0, | |
| "epoch": 0.8809523809523809, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12743739498262194, | |
| "kl": 0.0013438629566735472, | |
| "learning_rate": 1.7635449567430183e-08, | |
| "loss": 1.2833625078201294e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5485579967498779, | |
| "reward_std": 0.3817247748374939, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5485579967498779, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3858395516872406, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3359.0, | |
| "completions/mean_length": 1184.99609375, | |
| "completions/min_length": 479.0, | |
| "epoch": 0.8843537414965986, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1264194885841443, | |
| "kl": 0.0014367117000801954, | |
| "learning_rate": 1.6653288463741062e-08, | |
| "loss": 1.5553086996078491e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5498301982879639, | |
| "reward_std": 0.376219242811203, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5498301982879639, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.39297208189964294, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3728.0, | |
| "completions/mean_length": 1216.14453125, | |
| "completions/min_length": 491.0, | |
| "epoch": 0.8877551020408163, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12428839600576591, | |
| "kl": 0.0013569434822784388, | |
| "learning_rate": 1.5698323748414122e-08, | |
| "loss": 1.3709068298339844e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6100364923477173, | |
| "reward_std": 0.3211628198623657, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6100364923477173, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3399140238761902, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2995.0, | |
| "completions/mean_length": 1211.3984375, | |
| "completions/min_length": 445.0, | |
| "epoch": 0.891156462585034, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13142807112400828, | |
| "kl": 0.001364831018690893, | |
| "learning_rate": 1.4770666721887621e-08, | |
| "loss": 1.3671815395355225e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.431456983089447, | |
| "reward_std": 0.4261000454425812, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.431456983089447, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42020177841186523, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3240.0, | |
| "completions/mean_length": 1270.296875, | |
| "completions/min_length": 494.0, | |
| "epoch": 0.8945578231292517, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12432611658097836, | |
| "kl": 0.0013498828893716563, | |
| "learning_rate": 1.3870425501908672e-08, | |
| "loss": 1.3872049748897552e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.3706500828266144, | |
| "reward_std": 0.4424039423465729, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.37065011262893677, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4383034110069275, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3257.0, | |
| "completions/mean_length": 1216.11328125, | |
| "completions/min_length": 497.0, | |
| "epoch": 0.8979591836734694, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12923448726401368, | |
| "kl": 0.001308337137743365, | |
| "learning_rate": 1.2997705010932391e-08, | |
| "loss": 1.3634562492370605e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.583458662033081, | |
| "reward_std": 0.4207555055618286, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5834586024284363, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4383680522441864, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3966.0, | |
| "completions/mean_length": 1267.04296875, | |
| "completions/min_length": 462.0, | |
| "epoch": 0.9013605442176871, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12273271242830487, | |
| "kl": 0.0013095537397020962, | |
| "learning_rate": 1.2152606963892863e-08, | |
| "loss": 1.298263669013977e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5434870719909668, | |
| "reward_std": 0.3962419033050537, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5434870719909668, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.40210050344467163, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2793.0, | |
| "completions/mean_length": 1221.6484375, | |
| "completions/min_length": 498.0, | |
| "epoch": 0.9047619047619048, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1373786488327759, | |
| "kl": 0.0013954197893326636, | |
| "learning_rate": 1.1335229856348689e-08, | |
| "loss": 1.4295801520347595e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4984479248523712, | |
| "reward_std": 0.336830735206604, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4984479546546936, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.377440869808197, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4732.0, | |
| "completions/mean_length": 1256.984375, | |
| "completions/min_length": 511.0, | |
| "epoch": 0.9081632653061225, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13037104352607035, | |
| "kl": 0.0012341619449216523, | |
| "learning_rate": 1.054566895300324e-08, | |
| "loss": 1.2405216693878174e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.33690741658210754, | |
| "reward_std": 0.4263767600059509, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.33690741658210754, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.43096432089805603, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2919.0, | |
| "completions/mean_length": 1166.61328125, | |
| "completions/min_length": 383.0, | |
| "epoch": 0.9115646258503401, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13027755451241033, | |
| "kl": 0.0013724857608394814, | |
| "learning_rate": 9.784016276601609e-09, | |
| "loss": 1.3690441846847534e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5407834053039551, | |
| "reward_std": 0.3101251721382141, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5407834053039551, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.34196120500564575, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2649.0, | |
| "completions/mean_length": 1170.43359375, | |
| "completions/min_length": 470.0, | |
| "epoch": 0.9149659863945578, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12799678385136565, | |
| "kl": 0.0014380482898559421, | |
| "learning_rate": 9.050360597205513e-09, | |
| "loss": 1.475214958190918e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6152033805847168, | |
| "reward_std": 0.3536611795425415, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6152033805847168, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3689250349998474, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4481.0, | |
| "completions/mean_length": 1223.59375, | |
| "completions/min_length": 460.0, | |
| "epoch": 0.9183673469387755, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1286436267395403, | |
| "kl": 0.0014034728528713458, | |
| "learning_rate": 8.344787421847216e-09, | |
| "loss": 1.4491379261016846e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5233062505722046, | |
| "reward_std": 0.37922120094299316, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5233062505722046, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4167995750904083, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3936.0, | |
| "completions/mean_length": 1185.27734375, | |
| "completions/min_length": 409.0, | |
| "epoch": 0.9217687074829932, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13565037787124126, | |
| "kl": 0.0014686701588288997, | |
| "learning_rate": 7.667378984563599e-09, | |
| "loss": 1.4975666999816895e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6511124968528748, | |
| "reward_std": 0.3759189546108246, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6511124968528748, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.37731456756591797, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3347.0, | |
| "completions/mean_length": 1224.71875, | |
| "completions/min_length": 511.0, | |
| "epoch": 0.9251700680272109, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13122739068113443, | |
| "kl": 0.0014554602657881333, | |
| "learning_rate": 7.018214236812009e-09, | |
| "loss": 1.4603137969970703e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6520252823829651, | |
| "reward_std": 0.3065425753593445, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6520252823829651, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.336844801902771, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1329.84765625, | |
| "completions/min_length": 408.0, | |
| "epoch": 0.9285714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12969250322344092, | |
| "kl": 0.0013099484158374253, | |
| "learning_rate": 6.397368838268496e-09, | |
| "loss": 1.3224780559539795e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4384145736694336, | |
| "reward_std": 0.4246937334537506, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4384145736694336, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.43505164980888367, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3074.0, | |
| "completions/mean_length": 1172.4453125, | |
| "completions/min_length": 414.0, | |
| "epoch": 0.9319727891156463, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12746256958452673, | |
| "kl": 0.0015114195266505703, | |
| "learning_rate": 5.80491514800957e-09, | |
| "loss": 1.543201506137848e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5098080635070801, | |
| "reward_std": 0.4241393804550171, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5098081827163696, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.43560436367988586, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3518.0, | |
| "completions/mean_length": 1221.8046875, | |
| "completions/min_length": 476.0, | |
| "epoch": 0.935374149659864, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12529544378366023, | |
| "kl": 0.001437538394384319, | |
| "learning_rate": 5.24092221607908e-09, | |
| "loss": 1.4826655387878418e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.4893878102302551, | |
| "reward_std": 0.40568646788597107, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.4893878400325775, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4118155241012573, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6452.0, | |
| "completions/mean_length": 1276.9453125, | |
| "completions/min_length": 459.0, | |
| "epoch": 0.9387755102040817, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13384851249165047, | |
| "kl": 0.0013293683077790774, | |
| "learning_rate": 4.705455775440237e-09, | |
| "loss": 1.1986121535301208e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.47771668434143066, | |
| "reward_std": 0.40792709589004517, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.47771668434143066, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42754605412483215, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3401.0, | |
| "completions/mean_length": 1260.4296875, | |
| "completions/min_length": 402.0, | |
| "epoch": 0.9421768707482994, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12174752100098382, | |
| "kl": 0.0013493520209522103, | |
| "learning_rate": 4.198578234314604e-09, | |
| "loss": 1.2908130884170532e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5614825487136841, | |
| "reward_std": 0.33593881130218506, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5614825487136841, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3463003933429718, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4036.0, | |
| "completions/mean_length": 1229.71875, | |
| "completions/min_length": 502.0, | |
| "epoch": 0.9455782312925171, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12422464239414316, | |
| "kl": 0.0013207711699578795, | |
| "learning_rate": 3.720348668908385e-09, | |
| "loss": 1.4547258615493774e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6281092762947083, | |
| "reward_std": 0.35857096314430237, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.628109335899353, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.35941147804260254, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2924.0, | |
| "completions/mean_length": 1186.828125, | |
| "completions/min_length": 534.0, | |
| "epoch": 0.9489795918367347, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12632069336817253, | |
| "kl": 0.0014034768064448144, | |
| "learning_rate": 3.2708228165273244e-09, | |
| "loss": 1.2293457984924316e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5960803031921387, | |
| "reward_std": 0.29391491413116455, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5960803031921387, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3531346917152405, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 8193.0, | |
| "completions/mean_length": 1162.16015625, | |
| "completions/min_length": 452.0, | |
| "epoch": 0.9523809523809523, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12842830063820992, | |
| "kl": 0.001470525316108251, | |
| "learning_rate": 2.850053069080344e-09, | |
| "loss": 1.4062970876693726e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5352786779403687, | |
| "reward_std": 0.3336600661277771, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5352786779403687, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3443100154399872, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3791.0, | |
| "completions/mean_length": 1248.890625, | |
| "completions/min_length": 546.0, | |
| "epoch": 0.95578231292517, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12484503942889262, | |
| "kl": 0.0013901375514251413, | |
| "learning_rate": 2.458088466973346e-09, | |
| "loss": 1.3709068298339844e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5107653141021729, | |
| "reward_std": 0.38533347845077515, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5107653141021729, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3988993465900421, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3588.0, | |
| "completions/mean_length": 1303.5703125, | |
| "completions/min_length": 506.0, | |
| "epoch": 0.9591836734693877, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12427816786444691, | |
| "kl": 0.0013491953013726743, | |
| "learning_rate": 2.094974693393731e-09, | |
| "loss": 1.3280659914016724e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5057296752929688, | |
| "reward_std": 0.35366684198379517, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.505729615688324, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3649117350578308, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3250.0, | |
| "completions/mean_length": 1188.65625, | |
| "completions/min_length": 475.0, | |
| "epoch": 0.9625850340136054, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12292104991319937, | |
| "kl": 0.0013688728076886036, | |
| "learning_rate": 1.7607540689859035e-09, | |
| "loss": 1.3783574104309082e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.47922593355178833, | |
| "reward_std": 0.37873101234436035, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.47922593355178833, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.39812812209129333, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2948.0, | |
| "completions/mean_length": 1189.20703125, | |
| "completions/min_length": 475.0, | |
| "epoch": 0.9659863945578231, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12510495565401736, | |
| "kl": 0.0013413819378911285, | |
| "learning_rate": 1.4554655469189437e-09, | |
| "loss": 1.3094395399093628e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.37418410181999207, | |
| "reward_std": 0.4322727918624878, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.37418413162231445, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.42694520950317383, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3314.0, | |
| "completions/mean_length": 1233.1796875, | |
| "completions/min_length": 434.0, | |
| "epoch": 0.9693877551020408, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12785761769933177, | |
| "kl": 0.0013754392584814923, | |
| "learning_rate": 1.1791447083465133e-09, | |
| "loss": 1.3709068298339844e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5906023383140564, | |
| "reward_std": 0.27952495217323303, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5906022787094116, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.29190781712532043, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2930.0, | |
| "completions/mean_length": 1205.79296875, | |
| "completions/min_length": 426.0, | |
| "epoch": 0.9727891156462585, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1261298576091256, | |
| "kl": 0.001388487350595824, | |
| "learning_rate": 9.318237582600086e-10, | |
| "loss": 1.4435499906539917e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6461822986602783, | |
| "reward_std": 0.32930630445480347, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6461822986602783, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3550751507282257, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3610.0, | |
| "completions/mean_length": 1332.69140625, | |
| "completions/min_length": 566.0, | |
| "epoch": 0.9761904761904762, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1240331232588916, | |
| "kl": 0.0012782484163835761, | |
| "learning_rate": 7.135315217350891e-10, | |
| "loss": 1.3243407011032104e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6077274084091187, | |
| "reward_std": 0.29934900999069214, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6077274084091187, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.32556846737861633, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5343.0, | |
| "completions/mean_length": 1273.13671875, | |
| "completions/min_length": 515.0, | |
| "epoch": 0.9795918367346939, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1288140416013684, | |
| "kl": 0.0013770640471193474, | |
| "learning_rate": 5.242934405720878e-10, | |
| "loss": 1.5227124094963074e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.552533745765686, | |
| "reward_std": 0.37712985277175903, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.552533745765686, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3899850845336914, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2891.0, | |
| "completions/mean_length": 1344.17578125, | |
| "completions/min_length": 579.0, | |
| "epoch": 0.9829931972789115, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12675166680813696, | |
| "kl": 0.001224989002366783, | |
| "learning_rate": 3.6413157033077234e-10, | |
| "loss": 1.2069940567016602e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.48343604803085327, | |
| "reward_std": 0.3913576304912567, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.48343604803085327, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3972838819026947, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3515.0, | |
| "completions/mean_length": 1218.1875, | |
| "completions/min_length": 494.0, | |
| "epoch": 0.9863945578231292, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.12321595926218833, | |
| "kl": 0.0013713659063796513, | |
| "learning_rate": 2.3306457775981727e-10, | |
| "loss": 1.3820827007293701e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5790843963623047, | |
| "reward_std": 0.24808508157730103, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5790843963623047, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.27677780389785767, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4311.0, | |
| "completions/mean_length": 1273.83203125, | |
| "completions/min_length": 515.0, | |
| "epoch": 0.9897959183673469, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1313147159889535, | |
| "kl": 0.0014206118667061673, | |
| "learning_rate": 1.3110773862126667e-10, | |
| "loss": 1.4603137969970703e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5936909914016724, | |
| "reward_std": 0.3352019786834717, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5936910510063171, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.34831205010414124, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3531.0, | |
| "completions/mean_length": 1268.19921875, | |
| "completions/min_length": 497.0, | |
| "epoch": 0.9931972789115646, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.125741452166242, | |
| "kl": 0.0013244858291727724, | |
| "learning_rate": 5.827293591006976e-11, | |
| "loss": 1.3262033462524414e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.41567426919937134, | |
| "reward_std": 0.41647863388061523, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.41567426919937134, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.4198899269104004, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2584.0, | |
| "completions/mean_length": 1112.0234375, | |
| "completions/min_length": 421.0, | |
| "epoch": 0.9965986394557823, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.13234971889912275, | |
| "kl": 0.0014827251252427232, | |
| "learning_rate": 1.456865846913291e-11, | |
| "loss": 1.3709068298339844e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.6423551440238953, | |
| "reward_std": 0.35404038429260254, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.6423551440238953, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.3747543692588806, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3105.0, | |
| "completions/mean_length": 1199.40234375, | |
| "completions/min_length": 494.0, | |
| "epoch": 1.0, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1285475179466372, | |
| "kl": 0.0014067331903788727, | |
| "learning_rate": 0.0, | |
| "loss": 1.391395926475525e-06, | |
| "num_turns": 2.0, | |
| "reward": 0.5415540933609009, | |
| "reward_std": 0.35335230827331543, | |
| "rewards/MLPCodeOnPolicy32BORM/mean": 0.5415540933609009, | |
| "rewards/MLPCodeOnPolicy32BORM/std": 0.374896377325058, | |
| "step": 294 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 294, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |