diff --git "a/checkpoint-500/trainer_state.json" "b/checkpoint-500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-500/trainer_state.json" @@ -0,0 +1,16034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 282.3125, + "completions/mean_terminated_length": 282.3125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.34285064321011305, + "epoch": 0.002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6038885116577148, + "learning_rate": 0.0, + "loss": 0.1215, + "num_tokens": 9066.0, + "reward": 2.776546001434326, + "reward_std": 3.6707262992858887, + "rewards/fitness_reward/mean": 2.6902005672454834, + "rewards/fitness_reward/std": 3.9962387084960938, + "rewards/kidney_reward/mean": -0.014779508113861084, + "rewards/kidney_reward/std": 0.944055438041687, + "rewards/length2tails_reward/mean": 0.3619094491004944, + "rewards/length2tails_reward/std": 0.47263848781585693, + "rewards/thermo_reward/mean": 0.006515428423881531, + "rewards/thermo_reward/std": 1.582302212715149, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 265.3125, + "completions/mean_terminated_length": 265.3125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.25550550501793623, + "epoch": 0.004, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1680359840393066, + "learning_rate": 4e-08, + "loss": -0.0098, + "num_tokens": 17588.0, + "reward": 2.6952362060546875, + "reward_std": 3.8378028869628906, + "rewards/fitness_reward/mean": 2.763430118560791, + "rewards/fitness_reward/std": 4.361652374267578, + "rewards/kidney_reward/mean": -0.05466890335083008, + "rewards/kidney_reward/std": 1.101677656173706, + "rewards/length2tails_reward/mean": 0.49158233404159546, + "rewards/length2tails_reward/std": 0.4806991517543793, + "rewards/thermo_reward/mean": -0.3275108337402344, + "rewards/thermo_reward/std": 1.8423668146133423, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 269.9375, + "completions/mean_terminated_length": 269.9375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.30424352758564055, + "epoch": 0.006, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1832661628723145, + "learning_rate": 8e-08, + "loss": 0.0287, + "num_tokens": 26258.0, + "reward": 2.8955960273742676, + "reward_std": 4.059200763702393, + "rewards/fitness_reward/mean": 3.0105772018432617, + "rewards/fitness_reward/std": 4.357508659362793, + "rewards/kidney_reward/mean": -0.23322094976902008, + "rewards/kidney_reward/std": 0.9928128719329834, + "rewards/length2tails_reward/mean": 0.4026995301246643, + "rewards/length2tails_reward/std": 0.4532533884048462, + "rewards/thermo_reward/mean": -0.19809073209762573, + "rewards/thermo_reward/std": 1.4632333517074585, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 275.1875, + "completions/mean_terminated_length": 275.1875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.28253951808437705, + "epoch": 0.008, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9522150754928589, + "learning_rate": 1.2e-07, + "loss": 0.0271, + "num_tokens": 35096.0, + "reward": 1.998507022857666, + "reward_std": 3.9490599632263184, + "rewards/fitness_reward/mean": 2.1376171112060547, + "rewards/fitness_reward/std": 4.451599597930908, + "rewards/kidney_reward/mean": -0.14740341901779175, + "rewards/kidney_reward/std": 1.0896557569503784, + "rewards/length2tails_reward/mean": 0.4439179301261902, + "rewards/length2tails_reward/std": 0.4727405905723572, + "rewards/thermo_reward/mean": -0.3527754545211792, + "rewards/thermo_reward/std": 1.5906970500946045, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 258.15625, + "completions/mean_terminated_length": 258.15625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.14575099013745785, + "epoch": 0.01, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8847610950469971, + "learning_rate": 1.6e-07, + "loss": -0.0549, + "num_tokens": 43389.0, + "reward": 3.631153106689453, + "reward_std": 3.643018960952759, + "rewards/fitness_reward/mean": 3.822523593902588, + "rewards/fitness_reward/std": 3.948245048522949, + "rewards/kidney_reward/mean": -0.1129472404718399, + "rewards/kidney_reward/std": 1.0505248308181763, + "rewards/length2tails_reward/mean": 0.4005865156650543, + "rewards/length2tails_reward/std": 0.44541364908218384, + "rewards/thermo_reward/mean": -0.4700867533683777, + "rewards/thermo_reward/std": 1.7214210033416748, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.30541147477924824, + "epoch": 0.012, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3191102743148804, + "learning_rate": 2e-07, + "loss": 0.0231, + "num_tokens": 52123.0, + "reward": 3.4447648525238037, + "reward_std": 3.701854944229126, + "rewards/fitness_reward/mean": 3.469925880432129, + "rewards/fitness_reward/std": 3.4349849224090576, + "rewards/kidney_reward/mean": 0.07564692199230194, + "rewards/kidney_reward/std": 1.061210036277771, + "rewards/length2tails_reward/mean": 0.3852400779724121, + "rewards/length2tails_reward/std": 0.46152031421661377, + "rewards/thermo_reward/mean": -0.3185890316963196, + "rewards/thermo_reward/std": 1.1925057172775269, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 266.53125, + "completions/mean_terminated_length": 266.53125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.17479356518015265, + "epoch": 0.014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453187346458435, + "learning_rate": 2.4e-07, + "loss": -0.0039, + "num_tokens": 60684.0, + "reward": 4.399518013000488, + "reward_std": 3.6989517211914062, + "rewards/fitness_reward/mean": 4.309534072875977, + "rewards/fitness_reward/std": 3.625988483428955, + "rewards/kidney_reward/mean": 0.16553248465061188, + "rewards/kidney_reward/std": 1.1728154420852661, + "rewards/length2tails_reward/mean": 0.41060203313827515, + "rewards/length2tails_reward/std": 0.45657584071159363, + "rewards/thermo_reward/mean": -0.19086632132530212, + "rewards/thermo_reward/std": 1.5577343702316284, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 278.0, + "completions/mean_terminated_length": 278.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "entropy": 0.2710487926378846, + "epoch": 0.016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3498140573501587, + "learning_rate": 2.8e-07, + "loss": 0.0108, + "num_tokens": 69612.0, + "reward": 2.644428253173828, + "reward_std": 4.3834662437438965, + "rewards/fitness_reward/mean": 2.7699179649353027, + "rewards/fitness_reward/std": 4.351969242095947, + "rewards/kidney_reward/mean": -0.1517922580242157, + "rewards/kidney_reward/std": 1.004507303237915, + "rewards/length2tails_reward/mean": 0.4273234009742737, + "rewards/length2tails_reward/std": 0.4685462415218353, + "rewards/thermo_reward/mean": -0.31284886598587036, + "rewards/thermo_reward/std": 1.7761188745498657, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 284.0, + "completions/mean_terminated_length": 284.0, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.16439750418066978, + "epoch": 0.018, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4144439697265625, + "learning_rate": 3.2e-07, + "loss": 0.1057, + "num_tokens": 78732.0, + "reward": 2.6186068058013916, + "reward_std": 3.714634895324707, + "rewards/fitness_reward/mean": 2.789903163909912, + "rewards/fitness_reward/std": 4.058469295501709, + "rewards/kidney_reward/mean": -0.16366836428642273, + "rewards/kidney_reward/std": 1.120846152305603, + "rewards/length2tails_reward/mean": 0.4343424439430237, + "rewards/length2tails_reward/std": 0.47384053468704224, + "rewards/thermo_reward/mean": -0.3960953652858734, + "rewards/thermo_reward/std": 1.533713459968567, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 263.21875, + "completions/mean_terminated_length": 263.21875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 0.213692031102255, + "epoch": 0.02, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2669860124588013, + "learning_rate": 3.6e-07, + "loss": -0.0059, + "num_tokens": 87187.0, + "reward": 2.8165817260742188, + "reward_std": 3.6723098754882812, + "rewards/fitness_reward/mean": 3.185605049133301, + "rewards/fitness_reward/std": 4.036534309387207, + "rewards/kidney_reward/mean": -0.2615082561969757, + "rewards/kidney_reward/std": 0.9169155359268188, + "rewards/length2tails_reward/mean": 0.3918038606643677, + "rewards/length2tails_reward/std": 0.46539491415023804, + "rewards/thermo_reward/mean": -0.6724401712417603, + "rewards/thermo_reward/std": 1.888286828994751, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 276.03125, + "completions/mean_terminated_length": 276.03125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.1332897578831762, + "epoch": 0.022, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8590123057365417, + "learning_rate": 4e-07, + "loss": 0.0672, + "num_tokens": 96052.0, + "reward": 2.619511127471924, + "reward_std": 3.0328433513641357, + "rewards/fitness_reward/mean": 2.949721336364746, + "rewards/fitness_reward/std": 3.4479706287384033, + "rewards/kidney_reward/mean": -0.3590899705886841, + "rewards/kidney_reward/std": 0.6021451950073242, + "rewards/length2tails_reward/mean": 0.24954092502593994, + "rewards/length2tails_reward/std": 0.4148816764354706, + "rewards/thermo_reward/mean": -0.42610087990760803, + "rewards/thermo_reward/std": 1.485501766204834, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 294.0, + "completions/mean_terminated_length": 294.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.2908358834683895, + "epoch": 0.024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5993109941482544, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.1808, + "num_tokens": 105492.0, + "reward": 2.504091501235962, + "reward_std": 3.4993772506713867, + "rewards/fitness_reward/mean": 2.8465797901153564, + "rewards/fitness_reward/std": 4.194863319396973, + "rewards/kidney_reward/mean": -0.10735474526882172, + "rewards/kidney_reward/std": 1.0824220180511475, + "rewards/length2tails_reward/mean": 0.41566556692123413, + "rewards/length2tails_reward/std": 0.4747588336467743, + "rewards/thermo_reward/mean": -0.7854543328285217, + "rewards/thermo_reward/std": 1.695159912109375, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 265.0625, + "completions/mean_terminated_length": 265.0625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.2911716205999255, + "epoch": 0.026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40619695186615, + "learning_rate": 4.8e-07, + "loss": 0.004, + "num_tokens": 114006.0, + "reward": 1.9322597980499268, + "reward_std": 4.115525722503662, + "rewards/fitness_reward/mean": 2.053403615951538, + "rewards/fitness_reward/std": 4.407435417175293, + "rewards/kidney_reward/mean": -0.31295573711395264, + "rewards/kidney_reward/std": 0.8682083487510681, + "rewards/length2tails_reward/mean": 0.48870545625686646, + "rewards/length2tails_reward/std": 0.49721574783325195, + "rewards/thermo_reward/mean": -0.17368489503860474, + "rewards/thermo_reward/std": 2.0285637378692627, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 279.15625, + "completions/mean_terminated_length": 279.15625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.26627546083182096, + "epoch": 0.028, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0304509401321411, + "learning_rate": 5.2e-07, + "loss": 0.0939, + "num_tokens": 122971.0, + "reward": 2.5467944145202637, + "reward_std": 3.6509149074554443, + "rewards/fitness_reward/mean": 2.655552387237549, + "rewards/fitness_reward/std": 4.0566606521606445, + "rewards/kidney_reward/mean": 0.1742018312215805, + "rewards/kidney_reward/std": 1.108739972114563, + "rewards/length2tails_reward/mean": 0.346333771944046, + "rewards/length2tails_reward/std": 0.43246445059776306, + "rewards/thermo_reward/mean": -0.564885139465332, + "rewards/thermo_reward/std": 1.453627109527588, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.09859434771351516, + "epoch": 0.03, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0298631191253662, + "learning_rate": 5.6e-07, + "loss": 0.0469, + "num_tokens": 131707.0, + "reward": 3.6903483867645264, + "reward_std": 3.1678555011749268, + "rewards/fitness_reward/mean": 3.889268398284912, + "rewards/fitness_reward/std": 3.267036199569702, + "rewards/kidney_reward/mean": -0.27444595098495483, + "rewards/kidney_reward/std": 0.8988531231880188, + "rewards/length2tails_reward/mean": 0.25988560914993286, + "rewards/length2tails_reward/std": 0.39783328771591187, + "rewards/thermo_reward/mean": -0.25333669781684875, + "rewards/thermo_reward/std": 1.2771704196929932, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 267.28125, + "completions/mean_terminated_length": 267.28125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.19521328411065042, + "epoch": 0.032, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1851061582565308, + "learning_rate": 6e-07, + "loss": 0.0212, + "num_tokens": 140292.0, + "reward": 2.5893378257751465, + "reward_std": 3.584918737411499, + "rewards/fitness_reward/mean": 2.814847946166992, + "rewards/fitness_reward/std": 3.982393741607666, + "rewards/kidney_reward/mean": -0.11568474769592285, + "rewards/kidney_reward/std": 0.9866483807563782, + "rewards/length2tails_reward/mean": 0.3492621183395386, + "rewards/length2tails_reward/std": 0.45565006136894226, + "rewards/thermo_reward/mean": -0.5099663138389587, + "rewards/thermo_reward/std": 1.4580680131912231, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 282.5625, + "completions/mean_terminated_length": 282.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.28495415579527617, + "epoch": 0.034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2181986570358276, + "learning_rate": 6.4e-07, + "loss": 0.0974, + "num_tokens": 149366.0, + "reward": 2.808767795562744, + "reward_std": 3.643484115600586, + "rewards/fitness_reward/mean": 3.0713696479797363, + "rewards/fitness_reward/std": 3.9968082904815674, + "rewards/kidney_reward/mean": -0.382036954164505, + "rewards/kidney_reward/std": 0.8268358707427979, + "rewards/length2tails_reward/mean": 0.5024834871292114, + "rewards/length2tails_reward/std": 0.4791508615016937, + "rewards/thermo_reward/mean": -0.3944079279899597, + "rewards/thermo_reward/std": 1.9147756099700928, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 277.9375, + "completions/mean_terminated_length": 277.9375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.17844699136912823, + "epoch": 0.036, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2913175821304321, + "learning_rate": 6.800000000000001e-07, + "loss": 0.072, + "num_tokens": 158292.0, + "reward": 2.8846983909606934, + "reward_std": 3.638531446456909, + "rewards/fitness_reward/mean": 3.058784008026123, + "rewards/fitness_reward/std": 3.7510721683502197, + "rewards/kidney_reward/mean": -0.2737460136413574, + "rewards/kidney_reward/std": 0.8914804458618164, + "rewards/length2tails_reward/mean": 0.34188228845596313, + "rewards/length2tails_reward/std": 0.4734570384025574, + "rewards/thermo_reward/mean": -0.24536648392677307, + "rewards/thermo_reward/std": 1.4068657159805298, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 297.625, + "completions/mean_terminated_length": 297.625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.19913436053320765, + "epoch": 0.038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7019699811935425, + "learning_rate": 7.2e-07, + "loss": 0.1602, + "num_tokens": 167848.0, + "reward": 2.3710553646087646, + "reward_std": 4.5642547607421875, + "rewards/fitness_reward/mean": 2.7442526817321777, + "rewards/fitness_reward/std": 4.607006072998047, + "rewards/kidney_reward/mean": -0.2314104288816452, + "rewards/kidney_reward/std": 0.9755641222000122, + "rewards/length2tails_reward/mean": 0.4541005790233612, + "rewards/length2tails_reward/std": 0.48273563385009766, + "rewards/thermo_reward/mean": -0.7420345544815063, + "rewards/thermo_reward/std": 1.7983129024505615, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 274.25, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.10643508494831622, + "epoch": 0.04, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2317397594451904, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0675, + "num_tokens": 176656.0, + "reward": 3.6184439659118652, + "reward_std": 3.254441738128662, + "rewards/fitness_reward/mean": 4.041838645935059, + "rewards/fitness_reward/std": 3.4221901893615723, + "rewards/kidney_reward/mean": -0.1963132917881012, + "rewards/kidney_reward/std": 0.9940592050552368, + "rewards/length2tails_reward/mean": 0.3017134666442871, + "rewards/length2tails_reward/std": 0.44021928310394287, + "rewards/thermo_reward/mean": -0.8013330698013306, + "rewards/thermo_reward/std": 1.6256381273269653, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 283.46875, + "completions/mean_terminated_length": 283.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1744343377649784, + "epoch": 0.042, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2164242267608643, + "learning_rate": 8e-07, + "loss": 0.1111, + "num_tokens": 185759.0, + "reward": 2.710228443145752, + "reward_std": 3.926434278488159, + "rewards/fitness_reward/mean": 2.9178555011749268, + "rewards/fitness_reward/std": 3.801013946533203, + "rewards/kidney_reward/mean": -0.17946933209896088, + "rewards/kidney_reward/std": 1.0358121395111084, + "rewards/length2tails_reward/mean": 0.2689725160598755, + "rewards/length2tails_reward/std": 0.4235849678516388, + "rewards/thermo_reward/mean": -0.3702709972858429, + "rewards/thermo_reward/std": 1.3275806903839111, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 263.4375, + "completions/mean_terminated_length": 263.4375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.35666807275265455, + "epoch": 0.044, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3615895509719849, + "learning_rate": 8.399999999999999e-07, + "loss": -0.0103, + "num_tokens": 194221.0, + "reward": 1.7372136116027832, + "reward_std": 4.236116409301758, + "rewards/fitness_reward/mean": 1.6128908395767212, + "rewards/fitness_reward/std": 4.648111343383789, + "rewards/kidney_reward/mean": 0.22530707716941833, + "rewards/kidney_reward/std": 0.9966040253639221, + "rewards/length2tails_reward/mean": 0.5139718055725098, + "rewards/length2tails_reward/std": 0.48198941349983215, + "rewards/thermo_reward/mean": -0.23364755511283875, + "rewards/thermo_reward/std": 1.8280906677246094, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 293.46875, + "completions/mean_terminated_length": 293.46875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.2110906399320811, + "epoch": 0.046, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1795854568481445, + "learning_rate": 8.799999999999999e-07, + "loss": 0.1542, + "num_tokens": 203644.0, + "reward": 2.4099345207214355, + "reward_std": 3.9786722660064697, + "rewards/fitness_reward/mean": 2.844266891479492, + "rewards/fitness_reward/std": 4.1898393630981445, + "rewards/kidney_reward/mean": -0.45548105239868164, + "rewards/kidney_reward/std": 0.7641519904136658, + "rewards/length2tails_reward/mean": 0.49857550859451294, + "rewards/length2tails_reward/std": 0.4670303165912628, + "rewards/thermo_reward/mean": -0.6624711751937866, + "rewards/thermo_reward/std": 1.6689636707305908, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "entropy": 0.2916894480586052, + "epoch": 0.048, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.159227967262268, + "learning_rate": 9.2e-07, + "loss": 0.0248, + "num_tokens": 212328.0, + "reward": 2.407467842102051, + "reward_std": 3.439573287963867, + "rewards/fitness_reward/mean": 2.5296783447265625, + "rewards/fitness_reward/std": 3.747447967529297, + "rewards/kidney_reward/mean": -0.2728431224822998, + "rewards/kidney_reward/std": 0.8738398551940918, + "rewards/length2tails_reward/mean": 0.3025975227355957, + "rewards/length2tails_reward/std": 0.44032588601112366, + "rewards/thermo_reward/mean": -0.1228766143321991, + "rewards/thermo_reward/std": 1.4348689317703247, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 265.75, + "completions/mean_terminated_length": 265.75, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.1331020297948271, + "epoch": 0.05, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2533458471298218, + "learning_rate": 9.6e-07, + "loss": -0.0039, + "num_tokens": 220864.0, + "reward": 3.805389881134033, + "reward_std": 3.130521774291992, + "rewards/fitness_reward/mean": 3.9648332595825195, + "rewards/fitness_reward/std": 3.3284666538238525, + "rewards/kidney_reward/mean": -0.11595182865858078, + "rewards/kidney_reward/std": 0.9243037104606628, + "rewards/length2tails_reward/mean": 0.3001132011413574, + "rewards/length2tails_reward/std": 0.4152694344520569, + "rewards/thermo_reward/mean": -0.3529908061027527, + "rewards/thermo_reward/std": 1.5167909860610962, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 267.125, + "completions/mean_terminated_length": 267.125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.16058688261546195, + "epoch": 0.052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6334291100502014, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 229444.0, + "reward": 3.3169748783111572, + "reward_std": 3.120516538619995, + "rewards/fitness_reward/mean": 3.4925124645233154, + "rewards/fitness_reward/std": 3.340493679046631, + "rewards/kidney_reward/mean": -0.30361461639404297, + "rewards/kidney_reward/std": 0.7561451196670532, + "rewards/length2tails_reward/mean": 0.3624529242515564, + "rewards/length2tails_reward/std": 0.4537730813026428, + "rewards/thermo_reward/mean": -0.22868666052818298, + "rewards/thermo_reward/std": 1.5443941354751587, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 266.5625, + "completions/mean_terminated_length": 266.5625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.12170495046302676, + "epoch": 0.054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9874021410942078, + "learning_rate": 1.04e-06, + "loss": -0.0104, + "num_tokens": 238006.0, + "reward": 3.653249979019165, + "reward_std": 3.816178560256958, + "rewards/fitness_reward/mean": 3.920667886734009, + "rewards/fitness_reward/std": 4.002020835876465, + "rewards/kidney_reward/mean": -0.01481878012418747, + "rewards/kidney_reward/std": 1.17020845413208, + "rewards/length2tails_reward/mean": 0.45274999737739563, + "rewards/length2tails_reward/std": 0.4590936601161957, + "rewards/thermo_reward/mean": -0.7463918924331665, + "rewards/thermo_reward/std": 1.5766205787658691, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 264.9375, + "completions/mean_terminated_length": 264.9375, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 0.1064134482294321, + "epoch": 0.056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6786694526672363, + "learning_rate": 1.08e-06, + "loss": -0.0297, + "num_tokens": 246516.0, + "reward": 3.909611701965332, + "reward_std": 3.2675232887268066, + "rewards/fitness_reward/mean": 4.002963066101074, + "rewards/fitness_reward/std": 3.269036054611206, + "rewards/kidney_reward/mean": 0.0017276406288146973, + "rewards/kidney_reward/std": 1.061722993850708, + "rewards/length2tails_reward/mean": 0.3930842876434326, + "rewards/length2tails_reward/std": 0.44041237235069275, + "rewards/thermo_reward/mean": -0.3849724531173706, + "rewards/thermo_reward/std": 1.5721300840377808, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 285.46875, + "completions/mean_terminated_length": 285.46875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.2716119010001421, + "epoch": 0.058, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.758548617362976, + "learning_rate": 1.12e-06, + "loss": 0.0978, + "num_tokens": 255683.0, + "reward": 2.6089797019958496, + "reward_std": 4.1650495529174805, + "rewards/fitness_reward/mean": 2.7181997299194336, + "rewards/fitness_reward/std": 4.425798416137695, + "rewards/kidney_reward/mean": -0.15508919954299927, + "rewards/kidney_reward/std": 0.9946020245552063, + "rewards/length2tails_reward/mean": 0.5060371160507202, + "rewards/length2tails_reward/std": 0.4584464728832245, + "rewards/thermo_reward/mean": -0.31636953353881836, + "rewards/thermo_reward/std": 1.4987841844558716, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 293.46875, + "completions/mean_terminated_length": 293.46875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.25898733153007925, + "epoch": 0.06, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0409340858459473, + "learning_rate": 1.16e-06, + "loss": 0.1339, + "num_tokens": 265106.0, + "reward": 1.6009509563446045, + "reward_std": 3.8154296875, + "rewards/fitness_reward/mean": 1.8939182758331299, + "rewards/fitness_reward/std": 4.212096691131592, + "rewards/kidney_reward/mean": -0.3592352867126465, + "rewards/kidney_reward/std": 0.9245181679725647, + "rewards/length2tails_reward/mean": 0.3944413661956787, + "rewards/length2tails_reward/std": 0.46978193521499634, + "rewards/thermo_reward/mean": -0.4239196479320526, + "rewards/thermo_reward/std": 1.8588840961456299, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 279.90625, + "completions/mean_terminated_length": 279.90625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12356520257890224, + "epoch": 0.062, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1687328815460205, + "learning_rate": 1.2e-06, + "loss": 0.0934, + "num_tokens": 274095.0, + "reward": 3.447176456451416, + "reward_std": 3.1346819400787354, + "rewards/fitness_reward/mean": 3.8587636947631836, + "rewards/fitness_reward/std": 3.590867280960083, + "rewards/kidney_reward/mean": -0.4081210494041443, + "rewards/kidney_reward/std": 0.8285349607467651, + "rewards/length2tails_reward/mean": 0.46461570262908936, + "rewards/length2tails_reward/std": 0.47861263155937195, + "rewards/thermo_reward/mean": -0.6473608613014221, + "rewards/thermo_reward/std": 1.6887931823730469, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 260.15625, + "completions/mean_terminated_length": 260.15625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.13539172057062387, + "epoch": 0.064, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3024259805679321, + "learning_rate": 1.24e-06, + "loss": -0.0461, + "num_tokens": 282452.0, + "reward": 2.7450294494628906, + "reward_std": 3.819532632827759, + "rewards/fitness_reward/mean": 2.8516581058502197, + "rewards/fitness_reward/std": 4.211026668548584, + "rewards/kidney_reward/mean": -0.03711742162704468, + "rewards/kidney_reward/std": 0.9770359396934509, + "rewards/length2tails_reward/mean": 0.3360474109649658, + "rewards/length2tails_reward/std": 0.44527024030685425, + "rewards/thermo_reward/mean": -0.3441632390022278, + "rewards/thermo_reward/std": 1.6381678581237793, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 266.96875, + "completions/mean_terminated_length": 266.96875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 0.21443682396784425, + "epoch": 0.066, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1235257387161255, + "learning_rate": 1.28e-06, + "loss": 0.0131, + "num_tokens": 291027.0, + "reward": 2.749849319458008, + "reward_std": 3.7275326251983643, + "rewards/fitness_reward/mean": 3.279467821121216, + "rewards/fitness_reward/std": 3.8192431926727295, + "rewards/kidney_reward/mean": -0.3075902462005615, + "rewards/kidney_reward/std": 1.0802315473556519, + "rewards/length2tails_reward/mean": 0.40814924240112305, + "rewards/length2tails_reward/std": 0.46687832474708557, + "rewards/thermo_reward/mean": -0.9557211995124817, + "rewards/thermo_reward/std": 1.525609016418457, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 272.46875, + "completions/mean_terminated_length": 272.46875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "entropy": 0.11994511261582375, + "epoch": 0.068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5663752555847168, + "learning_rate": 1.32e-06, + "loss": 0.0547, + "num_tokens": 299778.0, + "reward": 2.085150718688965, + "reward_std": 3.8342347145080566, + "rewards/fitness_reward/mean": 2.482593059539795, + "rewards/fitness_reward/std": 3.909547805786133, + "rewards/kidney_reward/mean": -0.08659157156944275, + "rewards/kidney_reward/std": 1.036144495010376, + "rewards/length2tails_reward/mean": 0.29886335134506226, + "rewards/length2tails_reward/std": 0.4395237863063812, + "rewards/thermo_reward/mean": -0.8577248454093933, + "rewards/thermo_reward/std": 1.518059492111206, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 286.9375, + "completions/mean_terminated_length": 286.9375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.1442709225229919, + "epoch": 0.07, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0670015811920166, + "learning_rate": 1.3600000000000001e-06, + "loss": 0.1441, + "num_tokens": 308992.0, + "reward": 3.33197283744812, + "reward_std": 3.3288204669952393, + "rewards/fitness_reward/mean": 3.8933303356170654, + "rewards/fitness_reward/std": 3.8310139179229736, + "rewards/kidney_reward/mean": -0.3780807852745056, + "rewards/kidney_reward/std": 0.9036357998847961, + "rewards/length2tails_reward/mean": 0.39739546179771423, + "rewards/length2tails_reward/std": 0.4599010646343231, + "rewards/thermo_reward/mean": -0.9433316588401794, + "rewards/thermo_reward/std": 1.4738272428512573, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 274.46875, + "completions/mean_terminated_length": 274.46875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.09205999784171581, + "epoch": 0.072, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3518712520599365, + "learning_rate": 1.4e-06, + "loss": 0.0977, + "num_tokens": 317807.0, + "reward": 3.7158541679382324, + "reward_std": 3.447343349456787, + "rewards/fitness_reward/mean": 3.822787284851074, + "rewards/fitness_reward/std": 3.425808906555176, + "rewards/kidney_reward/mean": -0.19986845552921295, + "rewards/kidney_reward/std": 0.9525273442268372, + "rewards/length2tails_reward/mean": 0.289717435836792, + "rewards/length2tails_reward/std": 0.3901156783103943, + "rewards/thermo_reward/mean": -0.15885674953460693, + "rewards/thermo_reward/std": 1.5303553342819214, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 262.0, + "completions/mean_terminated_length": 262.0, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.23849095264449716, + "epoch": 0.074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8961380124092102, + "learning_rate": 1.44e-06, + "loss": -0.0484, + "num_tokens": 326223.0, + "reward": 3.2912168502807617, + "reward_std": 3.498786687850952, + "rewards/fitness_reward/mean": 3.485775947570801, + "rewards/fitness_reward/std": 3.9797747135162354, + "rewards/kidney_reward/mean": -0.08097781985998154, + "rewards/kidney_reward/std": 1.0017220973968506, + "rewards/length2tails_reward/mean": 0.45231589674949646, + "rewards/length2tails_reward/std": 0.4590526819229126, + "rewards/thermo_reward/mean": -0.5342980027198792, + "rewards/thermo_reward/std": 1.7394862174987793, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.15560346003621817, + "epoch": 0.076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8480332493782043, + "learning_rate": 1.48e-06, + "loss": 0.0411, + "num_tokens": 334929.0, + "reward": 3.250786304473877, + "reward_std": 3.4832067489624023, + "rewards/fitness_reward/mean": 3.512643814086914, + "rewards/fitness_reward/std": 3.6526308059692383, + "rewards/kidney_reward/mean": -0.3572557866573334, + "rewards/kidney_reward/std": 0.8072547912597656, + "rewards/length2tails_reward/mean": 0.4068295955657959, + "rewards/length2tails_reward/std": 0.46265122294425964, + "rewards/thermo_reward/mean": -0.3698734939098358, + "rewards/thermo_reward/std": 1.6630305051803589, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 281.3125, + "completions/mean_terminated_length": 281.3125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.15662717120721936, + "epoch": 0.078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.380427360534668, + "learning_rate": 1.5199999999999998e-06, + "loss": 0.1041, + "num_tokens": 343963.0, + "reward": 3.2305877208709717, + "reward_std": 3.6741504669189453, + "rewards/fitness_reward/mean": 3.7796645164489746, + "rewards/fitness_reward/std": 4.052594184875488, + "rewards/kidney_reward/mean": -0.5030925273895264, + "rewards/kidney_reward/std": 0.9221724271774292, + "rewards/length2tails_reward/mean": 0.4580508768558502, + "rewards/length2tails_reward/std": 0.4660351276397705, + "rewards/thermo_reward/mean": -0.8240861892700195, + "rewards/thermo_reward/std": 1.6861786842346191, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 260.53125, + "completions/mean_terminated_length": 260.53125, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "entropy": 0.08550065802410245, + "epoch": 0.08, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.308832049369812, + "learning_rate": 1.5599999999999999e-06, + "loss": -0.0705, + "num_tokens": 352332.0, + "reward": 3.550997734069824, + "reward_std": 3.215728282928467, + "rewards/fitness_reward/mean": 3.882805824279785, + "rewards/fitness_reward/std": 3.2661876678466797, + "rewards/kidney_reward/mean": -0.331551194190979, + "rewards/kidney_reward/std": 1.047247290611267, + "rewards/length2tails_reward/mean": 0.3643738627433777, + "rewards/length2tails_reward/std": 0.45199695229530334, + "rewards/thermo_reward/mean": -0.5142515301704407, + "rewards/thermo_reward/std": 1.6633535623550415, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 287.9375, + "completions/mean_terminated_length": 287.9375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.23963367426767945, + "epoch": 0.082, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.635361909866333, + "learning_rate": 1.6e-06, + "loss": 0.0942, + "num_tokens": 361578.0, + "reward": 2.2229952812194824, + "reward_std": 4.70599889755249, + "rewards/fitness_reward/mean": 2.4011287689208984, + "rewards/fitness_reward/std": 4.697346210479736, + "rewards/kidney_reward/mean": -0.21263191103935242, + "rewards/kidney_reward/std": 0.9460816383361816, + "rewards/length2tails_reward/mean": 0.49359941482543945, + "rewards/length2tails_reward/std": 0.4644700288772583, + "rewards/thermo_reward/mean": -0.39043474197387695, + "rewards/thermo_reward/std": 1.7590357065200806, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 264.3125, + "completions/mean_terminated_length": 264.3125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.17565014283172786, + "epoch": 0.084, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.304567813873291, + "learning_rate": 1.6399999999999998e-06, + "loss": -0.0273, + "num_tokens": 370068.0, + "reward": 2.681387424468994, + "reward_std": 2.717798948287964, + "rewards/fitness_reward/mean": 3.064466953277588, + "rewards/fitness_reward/std": 3.145963668823242, + "rewards/kidney_reward/mean": -0.30574914813041687, + "rewards/kidney_reward/std": 0.6779427528381348, + "rewards/length2tails_reward/mean": 0.283324658870697, + "rewards/length2tails_reward/std": 0.4304597079753876, + "rewards/thermo_reward/mean": -0.6020721197128296, + "rewards/thermo_reward/std": 1.501791000366211, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 265.625, + "completions/mean_terminated_length": 265.625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.049185109324753284, + "epoch": 0.086, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6598054766654968, + "learning_rate": 1.6799999999999998e-06, + "loss": -0.0068, + "num_tokens": 378600.0, + "reward": 3.6152701377868652, + "reward_std": 2.4673726558685303, + "rewards/fitness_reward/mean": 3.865413188934326, + "rewards/fitness_reward/std": 2.656116247177124, + "rewards/kidney_reward/mean": 0.025645185261964798, + "rewards/kidney_reward/std": 0.9594531059265137, + "rewards/length2tails_reward/mean": 0.2298400104045868, + "rewards/length2tails_reward/std": 0.4059338867664337, + "rewards/thermo_reward/mean": -0.6408505439758301, + "rewards/thermo_reward/std": 1.0837303400039673, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 284.875, + "completions/mean_terminated_length": 284.875, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.14746736688539386, + "epoch": 0.088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9718717336654663, + "learning_rate": 1.7199999999999998e-06, + "loss": 0.1251, + "num_tokens": 387748.0, + "reward": 3.236013650894165, + "reward_std": 3.3577160835266113, + "rewards/fitness_reward/mean": 3.322758913040161, + "rewards/fitness_reward/std": 4.015753269195557, + "rewards/kidney_reward/mean": -0.2718885838985443, + "rewards/kidney_reward/std": 1.0613834857940674, + "rewards/length2tails_reward/mean": 0.35513967275619507, + "rewards/length2tails_reward/std": 0.465355783700943, + "rewards/thermo_reward/mean": -0.07917150855064392, + "rewards/thermo_reward/std": 1.645909070968628, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 262.375, + "completions/mean_terminated_length": 262.375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.13994689658284187, + "epoch": 0.09, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.789834976196289, + "learning_rate": 1.7599999999999999e-06, + "loss": -0.0738, + "num_tokens": 396176.0, + "reward": 3.187227487564087, + "reward_std": 2.7353832721710205, + "rewards/fitness_reward/mean": 3.8397207260131836, + "rewards/fitness_reward/std": 3.054448366165161, + "rewards/kidney_reward/mean": -0.5340659618377686, + "rewards/kidney_reward/std": 0.7410010099411011, + "rewards/length2tails_reward/mean": 0.4532582461833954, + "rewards/length2tails_reward/std": 0.4908135235309601, + "rewards/thermo_reward/mean": -0.9975494146347046, + "rewards/thermo_reward/std": 1.4557503461837769, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 315.625, + "completions/mean_terminated_length": 301.4838562011719, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.3240436161868274, + "epoch": 0.092, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.263629913330078, + "learning_rate": 1.8e-06, + "loss": 0.2438, + "num_tokens": 406308.0, + "reward": 2.244203805923462, + "reward_std": 3.7210023403167725, + "rewards/fitness_reward/mean": 2.758302927017212, + "rewards/fitness_reward/std": 4.335577964782715, + "rewards/kidney_reward/mean": -0.517410397529602, + "rewards/kidney_reward/std": 0.6929426193237305, + "rewards/length2tails_reward/mean": 0.5104854702949524, + "rewards/length2tails_reward/std": 0.45414960384368896, + "rewards/thermo_reward/mean": -0.7660301923751831, + "rewards/thermo_reward/std": 1.8981391191482544, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 272.90625, + "completions/mean_terminated_length": 272.90625, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "entropy": 0.21957311406731606, + "epoch": 0.094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5725150108337402, + "learning_rate": 1.84e-06, + "loss": 0.0397, + "num_tokens": 415073.0, + "reward": 3.440593719482422, + "reward_std": 3.907878875732422, + "rewards/fitness_reward/mean": 3.6773455142974854, + "rewards/fitness_reward/std": 4.009237289428711, + "rewards/kidney_reward/mean": -0.3265707492828369, + "rewards/kidney_reward/std": 0.8724467754364014, + "rewards/length2tails_reward/mean": 0.4737565517425537, + "rewards/length2tails_reward/std": 0.47977402806282043, + "rewards/thermo_reward/mean": -0.38381147384643555, + "rewards/thermo_reward/std": 1.7817882299423218, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 280.71875, + "completions/mean_terminated_length": 280.71875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.20995492348447442, + "epoch": 0.096, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4390766620635986, + "learning_rate": 1.8799999999999998e-06, + "loss": 0.1414, + "num_tokens": 424088.0, + "reward": 2.7158737182617188, + "reward_std": 3.9450368881225586, + "rewards/fitness_reward/mean": 3.2037267684936523, + "rewards/fitness_reward/std": 4.233373165130615, + "rewards/kidney_reward/mean": -0.2726379632949829, + "rewards/kidney_reward/std": 1.0136029720306396, + "rewards/length2tails_reward/mean": 0.4437826871871948, + "rewards/length2tails_reward/std": 0.4791125953197479, + "rewards/thermo_reward/mean": -0.9249591827392578, + "rewards/thermo_reward/std": 1.5671353340148926, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 259.40625, + "completions/mean_terminated_length": 259.40625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "entropy": 0.09386752406135201, + "epoch": 0.098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9598005414009094, + "learning_rate": 1.92e-06, + "loss": -0.0616, + "num_tokens": 432421.0, + "reward": 3.1751842498779297, + "reward_std": 2.8628969192504883, + "rewards/fitness_reward/mean": 3.399622678756714, + "rewards/fitness_reward/std": 3.3204360008239746, + "rewards/kidney_reward/mean": -0.05504148453474045, + "rewards/kidney_reward/std": 0.9885132312774658, + "rewards/length2tails_reward/mean": 0.3248463273048401, + "rewards/length2tails_reward/std": 0.42490923404693604, + "rewards/thermo_reward/mean": -0.5562581419944763, + "rewards/thermo_reward/std": 1.7294560670852661, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 276.375, + "completions/mean_terminated_length": 276.375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.2751795544754714, + "epoch": 0.1, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5895367860794067, + "learning_rate": 1.96e-06, + "loss": 0.0493, + "num_tokens": 441297.0, + "reward": 3.4684815406799316, + "reward_std": 3.365799903869629, + "rewards/fitness_reward/mean": 3.8252737522125244, + "rewards/fitness_reward/std": 3.701256513595581, + "rewards/kidney_reward/mean": -0.21321097016334534, + "rewards/kidney_reward/std": 1.0331931114196777, + "rewards/length2tails_reward/mean": 0.5216790437698364, + "rewards/length2tails_reward/std": 0.475973904132843, + "rewards/thermo_reward/mean": -0.7612127065658569, + "rewards/thermo_reward/std": 1.7431429624557495, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 275.6875, + "completions/mean_terminated_length": 275.6875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.16012710286304355, + "epoch": 0.102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3491517305374146, + "learning_rate": 2e-06, + "loss": 0.0725, + "num_tokens": 450151.0, + "reward": 3.006770133972168, + "reward_std": 3.639115810394287, + "rewards/fitness_reward/mean": 3.219278335571289, + "rewards/fitness_reward/std": 3.9509146213531494, + "rewards/kidney_reward/mean": -0.12363478541374207, + "rewards/kidney_reward/std": 1.1107032299041748, + "rewards/length2tails_reward/mean": 0.3590978980064392, + "rewards/length2tails_reward/std": 0.4454381763935089, + "rewards/thermo_reward/mean": -0.48093003034591675, + "rewards/thermo_reward/std": 1.5363606214523315, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 275.875, + "completions/mean_terminated_length": 275.875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.15937610948458314, + "epoch": 0.104, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.491953730583191, + "learning_rate": 1.9999756307053944e-06, + "loss": 0.0897, + "num_tokens": 459011.0, + "reward": 2.9640777111053467, + "reward_std": 3.2988362312316895, + "rewards/fitness_reward/mean": 3.421854257583618, + "rewards/fitness_reward/std": 3.4940311908721924, + "rewards/kidney_reward/mean": -0.35488569736480713, + "rewards/kidney_reward/std": 0.8144198060035706, + "rewards/length2tails_reward/mean": 0.32573986053466797, + "rewards/length2tails_reward/std": 0.43565720319747925, + "rewards/thermo_reward/mean": -0.7235372066497803, + "rewards/thermo_reward/std": 1.4692848920822144, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 291.40625, + "completions/mean_terminated_length": 276.4838562011719, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.17189960647374392, + "epoch": 0.106, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6252596378326416, + "learning_rate": 1.999902524009304e-06, + "loss": 0.1744, + "num_tokens": 468368.0, + "reward": 3.256671667098999, + "reward_std": 3.157939910888672, + "rewards/fitness_reward/mean": 3.558439016342163, + "rewards/fitness_reward/std": 3.760011672973633, + "rewards/kidney_reward/mean": -0.20778009295463562, + "rewards/kidney_reward/std": 0.9953776597976685, + "rewards/length2tails_reward/mean": 0.41051942110061646, + "rewards/length2tails_reward/std": 0.47235342860221863, + "rewards/thermo_reward/mean": -0.6010138988494873, + "rewards/thermo_reward/std": 1.5003546476364136, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 267.5, + "completions/mean_terminated_length": 267.5, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.18772483337670565, + "epoch": 0.108, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5053292512893677, + "learning_rate": 1.999780683474845e-06, + "loss": 0.0136, + "num_tokens": 476960.0, + "reward": 3.904463529586792, + "reward_std": 3.229468822479248, + "rewards/fitness_reward/mean": 4.0994086265563965, + "rewards/fitness_reward/std": 3.267956018447876, + "rewards/kidney_reward/mean": 0.038502246141433716, + "rewards/kidney_reward/std": 1.0834846496582031, + "rewards/length2tails_reward/mean": 0.42780017852783203, + "rewards/length2tails_reward/std": 0.4738711416721344, + "rewards/thermo_reward/mean": -0.6422922611236572, + "rewards/thermo_reward/std": 1.6311054229736328, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 280.375, + "completions/mean_terminated_length": 280.375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.16723995190113783, + "epoch": 0.11, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2620344161987305, + "learning_rate": 1.999610115040354e-06, + "loss": 0.0829, + "num_tokens": 485964.0, + "reward": 2.829023838043213, + "reward_std": 3.2344512939453125, + "rewards/fitness_reward/mean": 3.2663230895996094, + "rewards/fitness_reward/std": 3.6452345848083496, + "rewards/kidney_reward/mean": -0.43373996019363403, + "rewards/kidney_reward/std": 0.8998933434486389, + "rewards/length2tails_reward/mean": 0.39806175231933594, + "rewards/length2tails_reward/std": 0.4529305696487427, + "rewards/thermo_reward/mean": -0.6398894786834717, + "rewards/thermo_reward/std": 1.7521892786026, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.15543030109256506, + "epoch": 0.112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3704290390014648, + "learning_rate": 1.9993908270190957e-06, + "loss": 0.05, + "num_tokens": 494708.0, + "reward": 2.963658571243286, + "reward_std": 2.9826788902282715, + "rewards/fitness_reward/mean": 3.5461554527282715, + "rewards/fitness_reward/std": 3.4882194995880127, + "rewards/kidney_reward/mean": -0.32047683000564575, + "rewards/kidney_reward/std": 0.8685289621353149, + "rewards/length2tails_reward/mean": 0.405619353055954, + "rewards/length2tails_reward/std": 0.4598184823989868, + "rewards/thermo_reward/mean": -1.0473260879516602, + "rewards/thermo_reward/std": 1.5689518451690674, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 286.75, + "completions/mean_terminated_length": 286.75, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.20465043978765607, + "epoch": 0.114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5429962873458862, + "learning_rate": 1.999122830098858e-06, + "loss": 0.1211, + "num_tokens": 503916.0, + "reward": 2.5245821475982666, + "reward_std": 3.371739625930786, + "rewards/fitness_reward/mean": 2.8426780700683594, + "rewards/fitness_reward/std": 3.971247434616089, + "rewards/kidney_reward/mean": -0.447263240814209, + "rewards/kidney_reward/std": 0.7793542146682739, + "rewards/length2tails_reward/mean": 0.41784876585006714, + "rewards/length2tails_reward/std": 0.4716041386127472, + "rewards/thermo_reward/mean": -0.39785265922546387, + "rewards/thermo_reward/std": 1.7068020105361938, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 283.96875, + "completions/mean_terminated_length": 283.96875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10966742131859064, + "epoch": 0.116, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2801357507705688, + "learning_rate": 1.998806137341434e-06, + "loss": 0.1257, + "num_tokens": 513035.0, + "reward": 2.501692771911621, + "reward_std": 3.395608425140381, + "rewards/fitness_reward/mean": 2.803565263748169, + "rewards/fitness_reward/std": 3.8261919021606445, + "rewards/kidney_reward/mean": -0.06101062148809433, + "rewards/kidney_reward/std": 0.9440979361534119, + "rewards/length2tails_reward/mean": 0.30113983154296875, + "rewards/length2tails_reward/std": 0.4402500092983246, + "rewards/thermo_reward/mean": -0.6933040022850037, + "rewards/thermo_reward/std": 1.348304033279419, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 266.46875, + "completions/mean_terminated_length": 266.46875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.10635471204295754, + "epoch": 0.118, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.288246750831604, + "learning_rate": 1.998440764181981e-06, + "loss": -0.0024, + "num_tokens": 521594.0, + "reward": 2.6305971145629883, + "reward_std": 3.2401773929595947, + "rewards/fitness_reward/mean": 3.008155345916748, + "rewards/fitness_reward/std": 3.672124147415161, + "rewards/kidney_reward/mean": -0.308817982673645, + "rewards/kidney_reward/std": 0.7729810476303101, + "rewards/length2tails_reward/mean": 0.29512399435043335, + "rewards/length2tails_reward/std": 0.42243650555610657, + "rewards/thermo_reward/mean": -0.5938605666160583, + "rewards/thermo_reward/std": 1.6341571807861328, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12811401998624206, + "epoch": 0.12, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5416864156723022, + "learning_rate": 1.9980267284282714e-06, + "loss": 0.0415, + "num_tokens": 530305.0, + "reward": 3.8451623916625977, + "reward_std": 2.8841209411621094, + "rewards/fitness_reward/mean": 4.159316062927246, + "rewards/fitness_reward/std": 2.810502767562866, + "rewards/kidney_reward/mean": -0.11423890292644501, + "rewards/kidney_reward/std": 0.9905949831008911, + "rewards/length2tails_reward/mean": 0.29447993636131287, + "rewards/length2tails_reward/std": 0.4234199523925781, + "rewards/thermo_reward/mean": -0.6613084077835083, + "rewards/thermo_reward/std": 1.4590983390808105, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.21011533495038748, + "epoch": 0.122, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1377606391906738, + "learning_rate": 1.997564050259824e-06, + "loss": -0.0385, + "num_tokens": 538984.0, + "reward": 3.4698257446289062, + "reward_std": 2.9833648204803467, + "rewards/fitness_reward/mean": 3.722118616104126, + "rewards/fitness_reward/std": 3.0271809101104736, + "rewards/kidney_reward/mean": -0.19583800435066223, + "rewards/kidney_reward/std": 0.8872043490409851, + "rewards/length2tails_reward/mean": 0.35539913177490234, + "rewards/length2tails_reward/std": 0.45690247416496277, + "rewards/thermo_reward/mean": -0.4864477515220642, + "rewards/thermo_reward/std": 1.4413105249404907, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 274.65625, + "completions/mean_terminated_length": 274.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1759026509243995, + "epoch": 0.124, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6390390396118164, + "learning_rate": 1.99705275222692e-06, + "loss": 0.0489, + "num_tokens": 547805.0, + "reward": 2.411684036254883, + "reward_std": 3.4048962593078613, + "rewards/fitness_reward/mean": 2.772763729095459, + "rewards/fitness_reward/std": 3.615152597427368, + "rewards/kidney_reward/mean": -0.2250068634748459, + "rewards/kidney_reward/std": 0.8008552193641663, + "rewards/length2tails_reward/mean": 0.38801515102386475, + "rewards/length2tails_reward/std": 0.4778028428554535, + "rewards/thermo_reward/mean": -0.6911601424217224, + "rewards/thermo_reward/std": 1.3928868770599365, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 257.09375, + "completions/mean_terminated_length": 257.09375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.12114082940388471, + "epoch": 0.126, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0728001594543457, + "learning_rate": 1.9964928592495045e-06, + "loss": -0.0602, + "num_tokens": 556064.0, + "reward": 2.318950653076172, + "reward_std": 3.8857407569885254, + "rewards/fitness_reward/mean": 2.7293825149536133, + "rewards/fitness_reward/std": 3.9336018562316895, + "rewards/kidney_reward/mean": -0.12004195153713226, + "rewards/kidney_reward/std": 1.0272102355957031, + "rewards/length2tails_reward/mean": 0.3241163492202759, + "rewards/length2tails_reward/std": 0.4461221396923065, + "rewards/thermo_reward/mean": -0.8628798723220825, + "rewards/thermo_reward/std": 1.4122726917266846, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 266.34375, + "completions/mean_terminated_length": 266.34375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.09513700450770557, + "epoch": 0.128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4474369287490845, + "learning_rate": 1.99588439861597e-06, + "loss": -0.0035, + "num_tokens": 564619.0, + "reward": 2.8130290508270264, + "reward_std": 2.8655166625976562, + "rewards/fitness_reward/mean": 3.189012289047241, + "rewards/fitness_reward/std": 3.4911272525787354, + "rewards/kidney_reward/mean": -0.19488850235939026, + "rewards/kidney_reward/std": 0.8539829254150391, + "rewards/length2tails_reward/mean": 0.3629145622253418, + "rewards/length2tails_reward/std": 0.4654383659362793, + "rewards/thermo_reward/mean": -0.7385349273681641, + "rewards/thermo_reward/std": 1.4404512643814087, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 273.71875, + "completions/mean_terminated_length": 273.71875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.13064821506850421, + "epoch": 0.13, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.423954963684082, + "learning_rate": 1.995227399981831e-06, + "loss": 0.0646, + "num_tokens": 573410.0, + "reward": 2.9144697189331055, + "reward_std": 2.6013689041137695, + "rewards/fitness_reward/mean": 3.6918673515319824, + "rewards/fitness_reward/std": 3.162837266921997, + "rewards/kidney_reward/mean": -0.5046856999397278, + "rewards/kidney_reward/std": 0.8479384779930115, + "rewards/length2tails_reward/mean": 0.39057034254074097, + "rewards/length2tails_reward/std": 0.47401538491249084, + "rewards/thermo_reward/mean": -1.2453944683074951, + "rewards/thermo_reward/std": 1.5381916761398315, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 263.75, + "completions/mean_terminated_length": 263.75, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.16724270419217646, + "epoch": 0.132, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4178781509399414, + "learning_rate": 1.994521895368273e-06, + "loss": -0.0306, + "num_tokens": 581882.0, + "reward": 3.1069273948669434, + "reward_std": 3.656845808029175, + "rewards/fitness_reward/mean": 3.6763453483581543, + "rewards/fitness_reward/std": 3.7498345375061035, + "rewards/kidney_reward/mean": -0.46478116512298584, + "rewards/kidney_reward/std": 0.780349612236023, + "rewards/length2tails_reward/mean": 0.4894040822982788, + "rewards/length2tails_reward/std": 0.4659845530986786, + "rewards/thermo_reward/mean": -0.9187557101249695, + "rewards/thermo_reward/std": 1.7948275804519653, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 263.46875, + "completions/mean_terminated_length": 263.46875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.047335606650449336, + "epoch": 0.134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.666408360004425, + "learning_rate": 1.9937679191605962e-06, + "loss": -0.0395, + "num_tokens": 590345.0, + "reward": 3.4559826850891113, + "reward_std": 2.2802734375, + "rewards/fitness_reward/mean": 4.01068115234375, + "rewards/fitness_reward/std": 2.509319305419922, + "rewards/kidney_reward/mean": -0.4572184383869171, + "rewards/kidney_reward/std": 0.5606120228767395, + "rewards/length2tails_reward/mean": 0.30433884263038635, + "rewards/length2tails_reward/std": 0.44804126024246216, + "rewards/thermo_reward/mean": -0.8043481111526489, + "rewards/thermo_reward/std": 1.5644283294677734, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 249.03125, + "completions/mean_terminated_length": 249.03125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.12496072240173817, + "epoch": 0.136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8123031258583069, + "learning_rate": 1.992965508106537e-06, + "loss": -0.1554, + "num_tokens": 598346.0, + "reward": 3.622310161590576, + "reward_std": 3.287194013595581, + "rewards/fitness_reward/mean": 3.8683528900146484, + "rewards/fitness_reward/std": 3.616657018661499, + "rewards/kidney_reward/mean": -0.33187222480773926, + "rewards/kidney_reward/std": 0.8844914436340332, + "rewards/length2tails_reward/mean": 0.40272101759910583, + "rewards/length2tails_reward/std": 0.4587315022945404, + "rewards/thermo_reward/mean": -0.3615736961364746, + "rewards/thermo_reward/std": 1.7587852478027344, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.06588866747915745, + "epoch": 0.138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8075075149536133, + "learning_rate": 1.9921147013144777e-06, + "loss": 0.084, + "num_tokens": 607094.0, + "reward": 3.5608577728271484, + "reward_std": 2.032958745956421, + "rewards/fitness_reward/mean": 3.9016268253326416, + "rewards/fitness_reward/std": 1.998185634613037, + "rewards/kidney_reward/mean": -0.2879283130168915, + "rewards/kidney_reward/std": 0.7849275469779968, + "rewards/length2tails_reward/mean": 0.1897910237312317, + "rewards/length2tails_reward/std": 0.3825874924659729, + "rewards/thermo_reward/mean": -0.488505482673645, + "rewards/thermo_reward/std": 1.2325689792633057, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 280.0, + "completions/mean_terminated_length": 280.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.1318660757970065, + "epoch": 0.14, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7601933479309082, + "learning_rate": 1.9912155402515414e-06, + "loss": 0.0819, + "num_tokens": 616086.0, + "reward": 3.302825927734375, + "reward_std": 3.520420789718628, + "rewards/fitness_reward/mean": 3.4547269344329834, + "rewards/fitness_reward/std": 4.018453598022461, + "rewards/kidney_reward/mean": -0.33877238631248474, + "rewards/kidney_reward/std": 1.066430687904358, + "rewards/length2tails_reward/mean": 0.5047099590301514, + "rewards/length2tails_reward/std": 0.45460987091064453, + "rewards/thermo_reward/mean": -0.21738454699516296, + "rewards/thermo_reward/std": 1.8911807537078857, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 281.125, + "completions/mean_terminated_length": 281.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12423830945044756, + "epoch": 0.142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3924140930175781, + "learning_rate": 1.99026806874157e-06, + "loss": 0.1156, + "num_tokens": 625114.0, + "reward": 3.2361419200897217, + "reward_std": 3.2574002742767334, + "rewards/fitness_reward/mean": 3.6235194206237793, + "rewards/fitness_reward/std": 3.311436176300049, + "rewards/kidney_reward/mean": -0.20599408447742462, + "rewards/kidney_reward/std": 0.8880826830863953, + "rewards/length2tails_reward/mean": 0.3809414505958557, + "rewards/length2tails_reward/std": 0.4668664038181305, + "rewards/thermo_reward/mean": -0.7592315673828125, + "rewards/thermo_reward/std": 1.6003990173339844, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 273.75, + "completions/mean_terminated_length": 273.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09949495655018836, + "epoch": 0.144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9823671579360962, + "learning_rate": 1.9892723329629885e-06, + "loss": 0.0914, + "num_tokens": 633906.0, + "reward": 3.915363073348999, + "reward_std": 2.6797749996185303, + "rewards/fitness_reward/mean": 4.155492782592773, + "rewards/fitness_reward/std": 2.356550455093384, + "rewards/kidney_reward/mean": -0.17341646552085876, + "rewards/kidney_reward/std": 0.8984330892562866, + "rewards/length2tails_reward/mean": 0.26437491178512573, + "rewards/length2tails_reward/std": 0.3996802270412445, + "rewards/thermo_reward/mean": -0.43903061747550964, + "rewards/thermo_reward/std": 1.5646251440048218, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 267.6875, + "completions/mean_terminated_length": 267.6875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.10478552733547986, + "epoch": 0.146, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3388633728027344, + "learning_rate": 1.9882283814465526e-06, + "loss": -0.0059, + "num_tokens": 642504.0, + "reward": 2.3275299072265625, + "reward_std": 3.3928511142730713, + "rewards/fitness_reward/mean": 2.9948863983154297, + "rewards/fitness_reward/std": 3.664214849472046, + "rewards/kidney_reward/mean": -0.5589306354522705, + "rewards/kidney_reward/std": 0.6325205564498901, + "rewards/length2tails_reward/mean": 0.3547426760196686, + "rewards/length2tails_reward/std": 0.4702369272708893, + "rewards/thermo_reward/mean": -0.9531527757644653, + "rewards/thermo_reward/std": 1.475315809249878, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 265.6875, + "completions/mean_terminated_length": 265.6875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.05924345087260008, + "epoch": 0.148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8762544989585876, + "learning_rate": 1.987136265072988e-06, + "loss": -0.0119, + "num_tokens": 651038.0, + "reward": 3.500439167022705, + "reward_std": 3.052443265914917, + "rewards/fitness_reward/mean": 4.001605033874512, + "rewards/fitness_reward/std": 2.928560733795166, + "rewards/kidney_reward/mean": -0.3237060010433197, + "rewards/kidney_reward/std": 0.9022273421287537, + "rewards/length2tails_reward/mean": 0.34477800130844116, + "rewards/length2tails_reward/std": 0.44316500425338745, + "rewards/thermo_reward/mean": -0.8510144948959351, + "rewards/thermo_reward/std": 1.473467469215393, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 273.3125, + "completions/mean_terminated_length": 273.3125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.38166644517332315, + "epoch": 0.15, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.392104983329773, + "learning_rate": 1.985996037070505e-06, + "loss": -0.0404, + "num_tokens": 659816.0, + "reward": 3.276812791824341, + "reward_std": 3.1364307403564453, + "rewards/fitness_reward/mean": 3.4767651557922363, + "rewards/fitness_reward/std": 3.71610689163208, + "rewards/kidney_reward/mean": -0.04081631079316139, + "rewards/kidney_reward/std": 0.9693803191184998, + "rewards/length2tails_reward/mean": 0.48275813460350037, + "rewards/length2tails_reward/std": 0.4699109196662903, + "rewards/thermo_reward/mean": -0.6004676818847656, + "rewards/thermo_reward/std": 1.6504100561141968, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 265.03125, + "completions/mean_terminated_length": 265.03125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.09045771509408951, + "epoch": 0.152, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0095329284667969, + "learning_rate": 1.984807753012208e-06, + "loss": -0.02, + "num_tokens": 668329.0, + "reward": 3.8370518684387207, + "reward_std": 2.9943735599517822, + "rewards/fitness_reward/mean": 4.162042617797852, + "rewards/fitness_reward/std": 3.4031171798706055, + "rewards/kidney_reward/mean": -0.1037202849984169, + "rewards/kidney_reward/std": 0.8663031458854675, + "rewards/length2tails_reward/mean": 0.40801501274108887, + "rewards/length2tails_reward/std": 0.46140018105506897, + "rewards/thermo_reward/mean": -0.7502682209014893, + "rewards/thermo_reward/std": 1.5727473497390747, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 263.96875, + "completions/mean_terminated_length": 263.96875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.07936325226910412, + "epoch": 0.154, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5812162160873413, + "learning_rate": 1.983571470813386e-06, + "loss": -0.0325, + "num_tokens": 676808.0, + "reward": 3.7221837043762207, + "reward_std": 2.660025119781494, + "rewards/fitness_reward/mean": 4.209771156311035, + "rewards/fitness_reward/std": 2.9468605518341064, + "rewards/kidney_reward/mean": -0.5054516196250916, + "rewards/kidney_reward/std": 0.7025943398475647, + "rewards/length2tails_reward/mean": 0.4130633771419525, + "rewards/length2tails_reward/std": 0.4388565123081207, + "rewards/thermo_reward/mean": -0.6762548089027405, + "rewards/thermo_reward/std": 1.6007850170135498, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 260.59375, + "completions/mean_terminated_length": 260.59375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.054782358231022954, + "epoch": 0.156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.408692479133606, + "learning_rate": 1.9822872507286887e-06, + "loss": -0.077, + "num_tokens": 685179.0, + "reward": 3.3060789108276367, + "reward_std": 2.6914637088775635, + "rewards/fitness_reward/mean": 3.7634127140045166, + "rewards/fitness_reward/std": 2.9260480403900146, + "rewards/kidney_reward/mean": -0.3477899730205536, + "rewards/kidney_reward/std": 0.8083656430244446, + "rewards/length2tails_reward/mean": 0.30918532609939575, + "rewards/length2tails_reward/std": 0.4387562870979309, + "rewards/thermo_reward/mean": -0.7214701175689697, + "rewards/thermo_reward/std": 1.4108153581619263, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 278.375, + "completions/mean_terminated_length": 278.375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.1216822536662221, + "epoch": 0.158, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7516916990280151, + "learning_rate": 1.9809551553491913e-06, + "loss": 0.0891, + "num_tokens": 694119.0, + "reward": 3.180351972579956, + "reward_std": 3.2595674991607666, + "rewards/fitness_reward/mean": 3.630638599395752, + "rewards/fitness_reward/std": 3.309762477874756, + "rewards/kidney_reward/mean": -0.2499833106994629, + "rewards/kidney_reward/std": 1.0745337009429932, + "rewards/length2tails_reward/mean": 0.4155610203742981, + "rewards/length2tails_reward/std": 0.47090789675712585, + "rewards/thermo_reward/mean": -0.8583704233169556, + "rewards/thermo_reward/std": 1.5894389152526855, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 265.625, + "completions/mean_terminated_length": 265.625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.06150074442848563, + "epoch": 0.16, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0668549537658691, + "learning_rate": 1.979575249599344e-06, + "loss": -0.0093, + "num_tokens": 702651.0, + "reward": 3.1801319122314453, + "reward_std": 2.3890247344970703, + "rewards/fitness_reward/mean": 3.6655666828155518, + "rewards/fitness_reward/std": 2.888410806655884, + "rewards/kidney_reward/mean": -0.5205367803573608, + "rewards/kidney_reward/std": 0.7554165124893188, + "rewards/length2tails_reward/mean": 0.30073481798171997, + "rewards/length2tails_reward/std": 0.43407922983169556, + "rewards/thermo_reward/mean": -0.600700318813324, + "rewards/thermo_reward/std": 1.570206880569458, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 278.59375, + "completions/mean_terminated_length": 278.59375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.19582223054021597, + "epoch": 0.162, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9738500118255615, + "learning_rate": 1.9781476007338054e-06, + "loss": 0.0849, + "num_tokens": 711598.0, + "reward": 3.531404495239258, + "reward_std": 2.798060417175293, + "rewards/fitness_reward/mean": 4.016422748565674, + "rewards/fitness_reward/std": 3.2364284992218018, + "rewards/kidney_reward/mean": -0.40827667713165283, + "rewards/kidney_reward/std": 0.8911892771720886, + "rewards/length2tails_reward/mean": 0.4315437376499176, + "rewards/length2tails_reward/std": 0.46955406665802, + "rewards/thermo_reward/mean": -0.7775313258171082, + "rewards/thermo_reward/std": 1.5042880773544312, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 274.75, + "completions/mean_terminated_length": 259.2903137207031, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.09533343114890158, + "epoch": 0.164, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.523869037628174, + "learning_rate": 1.9766722783341677e-06, + "loss": 0.086, + "num_tokens": 720422.0, + "reward": 3.6216368675231934, + "reward_std": 2.9999029636383057, + "rewards/fitness_reward/mean": 4.240582466125488, + "rewards/fitness_reward/std": 3.2040698528289795, + "rewards/kidney_reward/mean": -0.39840295910835266, + "rewards/kidney_reward/std": 1.0155638456344604, + "rewards/length2tails_reward/mean": 0.434470534324646, + "rewards/length2tails_reward/std": 0.46228936314582825, + "rewards/thermo_reward/mean": -1.0567247867584229, + "rewards/thermo_reward/std": 1.665966272354126, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 266.875, + "completions/mean_terminated_length": 266.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0562637432012707, + "epoch": 0.166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1357522010803223, + "learning_rate": 1.975149354305563e-06, + "loss": 0.0135, + "num_tokens": 728994.0, + "reward": 3.007725715637207, + "reward_std": 2.753161668777466, + "rewards/fitness_reward/mean": 3.52304744720459, + "rewards/fitness_reward/std": 2.9334521293640137, + "rewards/kidney_reward/mean": -0.41844168305397034, + "rewards/kidney_reward/std": 0.6413934230804443, + "rewards/length2tails_reward/mean": 0.2917965352535248, + "rewards/length2tails_reward/std": 0.4327235817909241, + "rewards/thermo_reward/mean": -0.7580999135971069, + "rewards/thermo_reward/std": 1.4481271505355835, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 286.65625, + "completions/mean_terminated_length": 286.65625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13436559145338833, + "epoch": 0.168, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7011663913726807, + "learning_rate": 1.97357890287316e-06, + "loss": 0.1895, + "num_tokens": 738199.0, + "reward": 4.253416538238525, + "reward_std": 2.8524696826934814, + "rewards/fitness_reward/mean": 4.332322597503662, + "rewards/fitness_reward/std": 2.9005343914031982, + "rewards/kidney_reward/mean": -0.07999514043331146, + "rewards/kidney_reward/std": 1.0148781538009644, + "rewards/length2tails_reward/mean": 0.33597058057785034, + "rewards/length2tails_reward/std": 0.43135184049606323, + "rewards/thermo_reward/mean": -0.24580200016498566, + "rewards/thermo_reward/std": 1.51665461063385, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 282.0625, + "completions/mean_terminated_length": 282.0625, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.17252697050571442, + "epoch": 0.17, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1756560802459717, + "learning_rate": 1.9719610005785463e-06, + "loss": 0.1212, + "num_tokens": 747257.0, + "reward": 2.786454200744629, + "reward_std": 3.7640724182128906, + "rewards/fitness_reward/mean": 3.159298896789551, + "rewards/fitness_reward/std": 4.33671236038208, + "rewards/kidney_reward/mean": -0.3573535680770874, + "rewards/kidney_reward/std": 0.8194970488548279, + "rewards/length2tails_reward/mean": 0.4246785044670105, + "rewards/length2tails_reward/std": 0.48692333698272705, + "rewards/thermo_reward/mean": -0.600675106048584, + "rewards/thermo_reward/std": 1.7436631917953491, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 287.21875, + "completions/mean_terminated_length": 287.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12859952612780035, + "epoch": 0.172, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2280685901641846, + "learning_rate": 1.9702957262759963e-06, + "loss": 0.1583, + "num_tokens": 756480.0, + "reward": 2.5727219581604004, + "reward_std": 3.09424090385437, + "rewards/fitness_reward/mean": 2.9395320415496826, + "rewards/fitness_reward/std": 3.4791810512542725, + "rewards/kidney_reward/mean": -0.23735620081424713, + "rewards/kidney_reward/std": 0.9329817891120911, + "rewards/length2tails_reward/mean": 0.30392399430274963, + "rewards/length2tails_reward/std": 0.45537668466567993, + "rewards/thermo_reward/mean": -0.6482260227203369, + "rewards/thermo_reward/std": 1.1679952144622803, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.12523282784968615, + "epoch": 0.174, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.673491954803467, + "learning_rate": 1.968583161128631e-06, + "loss": 0.0818, + "num_tokens": 765276.0, + "reward": 3.5213098526000977, + "reward_std": 2.275979518890381, + "rewards/fitness_reward/mean": 4.062276363372803, + "rewards/fitness_reward/std": 2.692859172821045, + "rewards/kidney_reward/mean": -0.4449991285800934, + "rewards/kidney_reward/std": 0.7791681885719299, + "rewards/length2tails_reward/mean": 0.3919871151447296, + "rewards/length2tails_reward/std": 0.4710420072078705, + "rewards/thermo_reward/mean": -0.8329271674156189, + "rewards/thermo_reward/std": 1.5729801654815674, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 276.0625, + "completions/mean_terminated_length": 276.0625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0831142186652869, + "epoch": 0.176, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.926175594329834, + "learning_rate": 1.9668233886044593e-06, + "loss": 0.0891, + "num_tokens": 774142.0, + "reward": 3.4537410736083984, + "reward_std": 2.8645174503326416, + "rewards/fitness_reward/mean": 3.9672062397003174, + "rewards/fitness_reward/std": 3.019857883453369, + "rewards/kidney_reward/mean": -0.44299253821372986, + "rewards/kidney_reward/std": 0.9003056287765503, + "rewards/length2tails_reward/mean": 0.3812631368637085, + "rewards/length2tails_reward/std": 0.455536812543869, + "rewards/thermo_reward/mean": -0.7745689153671265, + "rewards/thermo_reward/std": 1.6042786836624146, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 281.78125, + "completions/mean_terminated_length": 266.5483703613281, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.09161855978891253, + "epoch": 0.178, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.947537422180176, + "learning_rate": 1.9650164944723112e-06, + "loss": 0.123, + "num_tokens": 783191.0, + "reward": 3.7828750610351562, + "reward_std": 2.4898288249969482, + "rewards/fitness_reward/mean": 4.1685638427734375, + "rewards/fitness_reward/std": 2.7823469638824463, + "rewards/kidney_reward/mean": -0.2978300452232361, + "rewards/kidney_reward/std": 0.8650789856910706, + "rewards/length2tails_reward/mean": 0.3725648522377014, + "rewards/length2tails_reward/std": 0.44893646240234375, + "rewards/thermo_reward/mean": -0.6598291397094727, + "rewards/thermo_reward/std": 1.685317873954773, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 266.84375, + "completions/mean_terminated_length": 266.84375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.07641523564234376, + "epoch": 0.18, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4239964485168457, + "learning_rate": 1.963162566797658e-06, + "loss": -0.01, + "num_tokens": 791762.0, + "reward": 3.6204564571380615, + "reward_std": 3.3779494762420654, + "rewards/fitness_reward/mean": 4.185644149780273, + "rewards/fitness_reward/std": 3.01949405670166, + "rewards/kidney_reward/mean": -0.34998631477355957, + "rewards/kidney_reward/std": 0.845792829990387, + "rewards/length2tails_reward/mean": 0.3621581792831421, + "rewards/length2tails_reward/std": 0.44562968611717224, + "rewards/thermo_reward/mean": -0.961467981338501, + "rewards/thermo_reward/std": 1.495065450668335, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 277.8125, + "completions/mean_terminated_length": 277.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.083618309115991, + "epoch": 0.182, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8104112148284912, + "learning_rate": 1.9612616959383188e-06, + "loss": 0.093, + "num_tokens": 800684.0, + "reward": 3.970524787902832, + "reward_std": 2.618279218673706, + "rewards/fitness_reward/mean": 4.449347496032715, + "rewards/fitness_reward/std": 2.936826467514038, + "rewards/kidney_reward/mean": -0.2391345500946045, + "rewards/kidney_reward/std": 1.0095757246017456, + "rewards/length2tails_reward/mean": 0.45980584621429443, + "rewards/length2tails_reward/std": 0.45448037981987, + "rewards/thermo_reward/mean": -0.9484134316444397, + "rewards/thermo_reward/std": 1.801936388015747, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 266.90625, + "completions/mean_terminated_length": 266.90625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.05782184097915888, + "epoch": 0.184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44250741600990295, + "learning_rate": 1.9593139745400573e-06, + "loss": -0.0085, + "num_tokens": 809257.0, + "reward": 4.371463298797607, + "reward_std": 1.7807085514068604, + "rewards/fitness_reward/mean": 4.943473815917969, + "rewards/fitness_reward/std": 1.6741914749145508, + "rewards/kidney_reward/mean": -0.32885757088661194, + "rewards/kidney_reward/std": 0.9096606373786926, + "rewards/length2tails_reward/mean": 0.3909035325050354, + "rewards/length2tails_reward/std": 0.4627934992313385, + "rewards/thermo_reward/mean": -1.0106147527694702, + "rewards/thermo_reward/std": 1.459147572517395, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 265.59375, + "completions/mean_terminated_length": 265.59375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.13330286438576877, + "epoch": 0.186, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.56676983833313, + "learning_rate": 1.957319497532067e-06, + "loss": -0.0201, + "num_tokens": 817788.0, + "reward": 3.7524971961975098, + "reward_std": 2.9713351726531982, + "rewards/fitness_reward/mean": 4.150823593139648, + "rewards/fitness_reward/std": 3.1422393321990967, + "rewards/kidney_reward/mean": -0.12235195934772491, + "rewards/kidney_reward/std": 1.0004616975784302, + "rewards/length2tails_reward/mean": 0.4478749632835388, + "rewards/length2tails_reward/std": 0.4696106016635895, + "rewards/thermo_reward/mean": -0.898237943649292, + "rewards/thermo_reward/std": 1.6778795719146729, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 266.84375, + "completions/mean_terminated_length": 266.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.05517435655929148, + "epoch": 0.188, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18077127635478973, + "learning_rate": 1.9552783621223435e-06, + "loss": -0.0066, + "num_tokens": 826359.0, + "reward": 4.230867862701416, + "reward_std": 1.6108834743499756, + "rewards/fitness_reward/mean": 4.840484619140625, + "rewards/fitness_reward/std": 1.6709182262420654, + "rewards/kidney_reward/mean": -0.1514122188091278, + "rewards/kidney_reward/std": 0.9166415333747864, + "rewards/length2tails_reward/mean": 0.3408389687538147, + "rewards/length2tails_reward/std": 0.445056289434433, + "rewards/thermo_reward/mean": -1.2382405996322632, + "rewards/thermo_reward/std": 1.488350749015808, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 265.9375, + "completions/mean_terminated_length": 265.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.06825397931970656, + "epoch": 0.19, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9398273229598999, + "learning_rate": 1.953190667792947e-06, + "loss": -0.0041, + "num_tokens": 834901.0, + "reward": 3.9511187076568604, + "reward_std": 2.8120744228363037, + "rewards/fitness_reward/mean": 4.169844150543213, + "rewards/fitness_reward/std": 2.7375926971435547, + "rewards/kidney_reward/mean": -0.06316429376602173, + "rewards/kidney_reward/std": 1.0275908708572388, + "rewards/length2tails_reward/mean": 0.2595304250717163, + "rewards/length2tails_reward/std": 0.3988112211227417, + "rewards/thermo_reward/mean": -0.5040514469146729, + "rewards/thermo_reward/std": 1.4217562675476074, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 274.59375, + "completions/mean_terminated_length": 274.59375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.07531367521733046, + "epoch": 0.192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9723393321037292, + "learning_rate": 1.9510565162951534e-06, + "loss": 0.0485, + "num_tokens": 843720.0, + "reward": 3.428475856781006, + "reward_std": 3.01780104637146, + "rewards/fitness_reward/mean": 3.9550271034240723, + "rewards/fitness_reward/std": 3.3777401447296143, + "rewards/kidney_reward/mean": -0.39997780323028564, + "rewards/kidney_reward/std": 0.961540162563324, + "rewards/length2tails_reward/mean": 0.4574941396713257, + "rewards/length2tails_reward/std": 0.46799859404563904, + "rewards/thermo_reward/mean": -0.8818715810775757, + "rewards/thermo_reward/std": 1.7305375337600708, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 275.0, + "completions/mean_terminated_length": 275.0, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.09688100079074502, + "epoch": 0.194, + "frac_reward_zero_std": 0.0, + "grad_norm": NaN, + "learning_rate": 1.9488760116444964e-06, + "loss": 0.0737, + "num_tokens": 852552.0, + "reward": 3.657806873321533, + "reward_std": 2.6497535705566406, + "rewards/fitness_reward/mean": 4.058182716369629, + "rewards/fitness_reward/std": 3.0874576568603516, + "rewards/kidney_reward/mean": -0.2950325906276703, + "rewards/kidney_reward/std": 0.8756623268127441, + "rewards/length2tails_reward/mean": 0.4380313456058502, + "rewards/length2tails_reward/std": 0.47069379687309265, + "rewards/thermo_reward/mean": -0.7247352600097656, + "rewards/thermo_reward/std": 1.6496366262435913, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 268.6875, + "completions/mean_terminated_length": 268.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.05954432673752308, + "epoch": 0.196, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.811093270778656, + "learning_rate": 1.9488760116444964e-06, + "loss": 0.0184, + "num_tokens": 861182.0, + "reward": 3.9953298568725586, + "reward_std": 2.4319796562194824, + "rewards/fitness_reward/mean": 4.486875057220459, + "rewards/fitness_reward/std": 2.3484222888946533, + "rewards/kidney_reward/mean": -0.2622259259223938, + "rewards/kidney_reward/std": 1.0675276517868042, + "rewards/length2tails_reward/mean": 0.37341785430908203, + "rewards/length2tails_reward/std": 0.45848578214645386, + "rewards/thermo_reward/mean": -0.907573401927948, + "rewards/thermo_reward/std": 1.5665401220321655, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 275.71875, + "completions/mean_terminated_length": 260.2903137207031, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.13262386550195515, + "epoch": 0.198, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.5504302978515625, + "learning_rate": 1.9466492601156963e-06, + "loss": 0.1024, + "num_tokens": 870037.0, + "reward": 2.7974579334259033, + "reward_std": 2.8245115280151367, + "rewards/fitness_reward/mean": 3.18468976020813, + "rewards/fitness_reward/std": 3.181276798248291, + "rewards/kidney_reward/mean": -0.3316881060600281, + "rewards/kidney_reward/std": 0.7468500137329102, + "rewards/length2tails_reward/mean": 0.2897987961769104, + "rewards/length2tails_reward/std": 0.4419374465942383, + "rewards/thermo_reward/mean": -0.587674617767334, + "rewards/thermo_reward/std": 1.3871588706970215, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 283.9375, + "completions/mean_terminated_length": 283.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12895289808511734, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.474762201309204, + "learning_rate": 1.944376370237481e-06, + "loss": 0.1547, + "num_tokens": 879155.0, + "reward": 4.534636497497559, + "reward_std": 2.668832778930664, + "rewards/fitness_reward/mean": 4.991495132446289, + "rewards/fitness_reward/std": 2.8266241550445557, + "rewards/kidney_reward/mean": -0.2184591293334961, + "rewards/kidney_reward/std": 1.0288792848587036, + "rewards/length2tails_reward/mean": 0.5330479145050049, + "rewards/length2tails_reward/std": 0.44932231307029724, + "rewards/thermo_reward/mean": -0.9617821574211121, + "rewards/thermo_reward/std": 1.8963667154312134, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08518685982562602, + "epoch": 0.202, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.835224986076355, + "learning_rate": 1.9420574527872966e-06, + "loss": 0.0636, + "num_tokens": 887926.0, + "reward": 3.950529098510742, + "reward_std": 2.339932918548584, + "rewards/fitness_reward/mean": 4.407957077026367, + "rewards/fitness_reward/std": 2.659775733947754, + "rewards/kidney_reward/mean": 0.050680406391620636, + "rewards/kidney_reward/std": 0.8810163736343384, + "rewards/length2tails_reward/mean": 0.29464665055274963, + "rewards/length2tails_reward/std": 0.4334779679775238, + "rewards/thermo_reward/mean": -1.1128590106964111, + "rewards/thermo_reward/std": 1.0931978225708008, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 286.875, + "completions/mean_terminated_length": 286.875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.130730795674026, + "epoch": 0.204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2770278453826904, + "learning_rate": 1.9396926207859082e-06, + "loss": 0.138, + "num_tokens": 897138.0, + "reward": 3.412665367126465, + "reward_std": 3.999739408493042, + "rewards/fitness_reward/mean": 3.8380305767059326, + "rewards/fitness_reward/std": 3.9377808570861816, + "rewards/kidney_reward/mean": -0.5111966133117676, + "rewards/kidney_reward/std": 1.0602242946624756, + "rewards/length2tails_reward/mean": 0.5344866514205933, + "rewards/length2tails_reward/std": 0.46006661653518677, + "rewards/thermo_reward/mean": -0.6067769527435303, + "rewards/thermo_reward/std": 1.68917715549469, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 266.84375, + "completions/mean_terminated_length": 266.84375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.09578574495390058, + "epoch": 0.206, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2568925619125366, + "learning_rate": 1.9372819894918914e-06, + "loss": -0.0062, + "num_tokens": 905709.0, + "reward": 4.232032299041748, + "reward_std": 2.5994482040405273, + "rewards/fitness_reward/mean": 4.540476322174072, + "rewards/fitness_reward/std": 2.9304285049438477, + "rewards/kidney_reward/mean": -0.3461866080760956, + "rewards/kidney_reward/std": 0.9004994630813599, + "rewards/length2tails_reward/mean": 0.47418013215065, + "rewards/length2tails_reward/std": 0.44722241163253784, + "rewards/thermo_reward/mean": -0.5077908635139465, + "rewards/thermo_reward/std": 1.6851264238357544, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.05232610600069165, + "epoch": 0.208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37754666805267334, + "learning_rate": 1.9348256763960142e-06, + "loss": -0.0148, + "num_tokens": 914371.0, + "reward": 3.1689558029174805, + "reward_std": 2.3899450302124023, + "rewards/fitness_reward/mean": 3.684023380279541, + "rewards/fitness_reward/std": 2.4902243614196777, + "rewards/kidney_reward/mean": -0.3063453733921051, + "rewards/kidney_reward/std": 0.674054741859436, + "rewards/length2tails_reward/mean": 0.25765612721443176, + "rewards/length2tails_reward/std": 0.423554390668869, + "rewards/thermo_reward/mean": -0.8526173830032349, + "rewards/thermo_reward/std": 1.3145358562469482, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 268.25, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.11395578144583851, + "epoch": 0.21, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8413267135620117, + "learning_rate": 1.9323238012155122e-06, + "loss": 0.0022, + "num_tokens": 922987.0, + "reward": 3.0292320251464844, + "reward_std": 3.132319211959839, + "rewards/fitness_reward/mean": 3.6769983768463135, + "rewards/fitness_reward/std": 3.4672458171844482, + "rewards/kidney_reward/mean": -0.5183227062225342, + "rewards/kidney_reward/std": 0.7655011415481567, + "rewards/length2tails_reward/mean": 0.44852757453918457, + "rewards/length2tails_reward/std": 0.4676968455314636, + "rewards/thermo_reward/mean": -1.001473307609558, + "rewards/thermo_reward/std": 1.6753126382827759, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 264.5625, + "completions/mean_terminated_length": 264.5625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.0989903915906325, + "epoch": 0.212, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1482203006744385, + "learning_rate": 1.929776485888251e-06, + "loss": -0.028, + "num_tokens": 931485.0, + "reward": 3.722275733947754, + "reward_std": 2.6445722579956055, + "rewards/fitness_reward/mean": 4.1574249267578125, + "rewards/fitness_reward/std": 2.748945713043213, + "rewards/kidney_reward/mean": -0.23157793283462524, + "rewards/kidney_reward/std": 0.9468393325805664, + "rewards/length2tails_reward/mean": 0.36526206135749817, + "rewards/length2tails_reward/std": 0.4607754945755005, + "rewards/thermo_reward/mean": -0.8213506937026978, + "rewards/thermo_reward/std": 1.4860751628875732, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 267.46875, + "completions/mean_terminated_length": 267.46875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 0.07739041280001402, + "epoch": 0.214, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2668060064315796, + "learning_rate": 1.9271838545667875e-06, + "loss": -0.0093, + "num_tokens": 940076.0, + "reward": 3.640249252319336, + "reward_std": 2.7999203205108643, + "rewards/fitness_reward/mean": 3.994453191757202, + "rewards/fitness_reward/std": 3.2945973873138428, + "rewards/kidney_reward/mean": -0.3617490231990814, + "rewards/kidney_reward/std": 0.8847697973251343, + "rewards/length2tails_reward/mean": 0.5171585083007812, + "rewards/length2tails_reward/std": 0.4940597116947174, + "rewards/thermo_reward/mean": -0.6052376627922058, + "rewards/thermo_reward/std": 2.0294535160064697, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 274.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1099214032292366, + "epoch": 0.216, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6436166763305664, + "learning_rate": 1.9245460336123133e-06, + "loss": 0.0649, + "num_tokens": 948884.0, + "reward": 2.887429714202881, + "reward_std": 3.679654121398926, + "rewards/fitness_reward/mean": 3.3409667015075684, + "rewards/fitness_reward/std": 3.7627742290496826, + "rewards/kidney_reward/mean": -0.4285740852355957, + "rewards/kidney_reward/std": 1.042330265045166, + "rewards/length2tails_reward/mean": 0.3566879630088806, + "rewards/length2tails_reward/std": 0.4686858654022217, + "rewards/thermo_reward/mean": -0.6568437218666077, + "rewards/thermo_reward/std": 1.446584939956665, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 261.15625, + "completions/mean_terminated_length": 261.15625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.0422589861555025, + "epoch": 0.218, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6566778421401978, + "learning_rate": 1.9218631515885003e-06, + "loss": -0.0857, + "num_tokens": 957273.0, + "reward": 3.377675771713257, + "reward_std": 2.2366034984588623, + "rewards/fitness_reward/mean": 3.9846253395080566, + "rewards/fitness_reward/std": 2.6026394367218018, + "rewards/kidney_reward/mean": -0.4901122450828552, + "rewards/kidney_reward/std": 0.698078989982605, + "rewards/length2tails_reward/mean": 0.3232583999633789, + "rewards/length2tails_reward/std": 0.4532923698425293, + "rewards/thermo_reward/mean": -0.8854160308837891, + "rewards/thermo_reward/std": 1.4513849020004272, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 266.34375, + "completions/mean_terminated_length": 266.34375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.06348483904730529, + "epoch": 0.22, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5598238706588745, + "learning_rate": 1.9191353392552343e-06, + "loss": -0.0301, + "num_tokens": 965828.0, + "reward": 3.7526564598083496, + "reward_std": 2.3632280826568604, + "rewards/fitness_reward/mean": 4.508893966674805, + "rewards/fitness_reward/std": 2.7085962295532227, + "rewards/kidney_reward/mean": -0.6108442544937134, + "rewards/kidney_reward/std": 0.7660795450210571, + "rewards/length2tails_reward/mean": 0.46491798758506775, + "rewards/length2tails_reward/std": 0.47420939803123474, + "rewards/thermo_reward/mean": -1.1340885162353516, + "rewards/thermo_reward/std": 1.5848442316055298, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 265.4375, + "completions/mean_terminated_length": 265.4375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.0810333862900734, + "epoch": 0.222, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3541920185089111, + "learning_rate": 1.9163627295622395e-06, + "loss": -0.0234, + "num_tokens": 974354.0, + "reward": 3.9480810165405273, + "reward_std": 2.8221023082733154, + "rewards/fitness_reward/mean": 4.40368127822876, + "rewards/fitness_reward/std": 3.02516770362854, + "rewards/kidney_reward/mean": -0.20891384780406952, + "rewards/kidney_reward/std": 1.0420880317687988, + "rewards/length2tails_reward/mean": 0.4156648516654968, + "rewards/length2tails_reward/std": 0.43952322006225586, + "rewards/thermo_reward/mean": -0.9101189374923706, + "rewards/thermo_reward/std": 1.2393132448196411, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 267.0, + "completions/mean_terminated_length": 267.0, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.06256379652768373, + "epoch": 0.224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2238101363182068, + "learning_rate": 1.9135454576426007e-06, + "loss": -0.0033, + "num_tokens": 982930.0, + "reward": 4.236385822296143, + "reward_std": 2.577726125717163, + "rewards/fitness_reward/mean": 4.564560890197754, + "rewards/fitness_reward/std": 2.516674280166626, + "rewards/kidney_reward/mean": -0.3167581856250763, + "rewards/kidney_reward/std": 0.9142901301383972, + "rewards/length2tails_reward/mean": 0.3586871922016144, + "rewards/length2tails_reward/std": 0.43271785974502563, + "rewards/thermo_reward/mean": -0.5189354419708252, + "rewards/thermo_reward/std": 1.6014158725738525, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 259.0625, + "completions/mean_terminated_length": 259.0625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.05426312144845724, + "epoch": 0.226, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21272876858711243, + "learning_rate": 1.910683660806177e-06, + "loss": -0.1044, + "num_tokens": 991252.0, + "reward": 4.512127876281738, + "reward_std": 2.468585729598999, + "rewards/fitness_reward/mean": 4.623218536376953, + "rewards/fitness_reward/std": 2.2350494861602783, + "rewards/kidney_reward/mean": -0.07495585083961487, + "rewards/kidney_reward/std": 1.085909128189087, + "rewards/length2tails_reward/mean": 0.3621150851249695, + "rewards/length2tails_reward/std": 0.4594513177871704, + "rewards/thermo_reward/mean": -0.3282831311225891, + "rewards/thermo_reward/std": 1.6514654159545898, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 267.125, + "completions/mean_terminated_length": 267.125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.05443617841228843, + "epoch": 0.228, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22862917184829712, + "learning_rate": 1.9077774785329087e-06, + "loss": -0.0068, + "num_tokens": 999832.0, + "reward": 4.045974254608154, + "reward_std": 1.8630603551864624, + "rewards/fitness_reward/mean": 4.666137218475342, + "rewards/fitness_reward/std": 2.078634738922119, + "rewards/kidney_reward/mean": -0.4568588137626648, + "rewards/kidney_reward/std": 0.887822687625885, + "rewards/length2tails_reward/mean": 0.3842611312866211, + "rewards/length2tails_reward/std": 0.46035540103912354, + "rewards/thermo_reward/mean": -0.9755982160568237, + "rewards/thermo_reward/std": 1.595231533050537, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 276.46875, + "completions/mean_terminated_length": 276.46875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.10052089486271143, + "epoch": 0.23, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.551025629043579, + "learning_rate": 1.9048270524660196e-06, + "loss": 0.0763, + "num_tokens": 1008711.0, + "reward": 3.8763861656188965, + "reward_std": 2.691915512084961, + "rewards/fitness_reward/mean": 4.211784362792969, + "rewards/fitness_reward/std": 3.2631149291992188, + "rewards/kidney_reward/mean": -0.20681864023208618, + "rewards/kidney_reward/std": 1.1009536981582642, + "rewards/length2tails_reward/mean": 0.44117099046707153, + "rewards/length2tails_reward/std": 0.4576721787452698, + "rewards/thermo_reward/mean": -0.6845625638961792, + "rewards/thermo_reward/std": 1.8933459520339966, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 277.09375, + "completions/mean_terminated_length": 277.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11264282860793173, + "epoch": 0.232, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.240277051925659, + "learning_rate": 1.9018325264051138e-06, + "loss": 0.1024, + "num_tokens": 1017610.0, + "reward": 4.128618240356445, + "reward_std": 2.4607045650482178, + "rewards/fitness_reward/mean": 4.626686096191406, + "rewards/fitness_reward/std": 2.66855525970459, + "rewards/kidney_reward/mean": -0.41873425245285034, + "rewards/kidney_reward/std": 0.8528873920440674, + "rewards/length2tails_reward/mean": 0.45648080110549927, + "rewards/length2tails_reward/std": 0.4659040570259094, + "rewards/thermo_reward/mean": -0.8056415319442749, + "rewards/thermo_reward/std": 1.8150216341018677, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 267.96875, + "completions/mean_terminated_length": 267.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.07115628337487578, + "epoch": 0.234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28056812286376953, + "learning_rate": 1.8987940462991669e-06, + "loss": -0.0076, + "num_tokens": 1026217.0, + "reward": 5.146751403808594, + "reward_std": 1.9194636344909668, + "rewards/fitness_reward/mean": 5.458418846130371, + "rewards/fitness_reward/std": 1.5903440713882446, + "rewards/kidney_reward/mean": -0.3146427869796753, + "rewards/kidney_reward/std": 0.9314286708831787, + "rewards/length2tails_reward/mean": 0.4887458086013794, + "rewards/length2tails_reward/std": 0.445161372423172, + "rewards/thermo_reward/mean": -0.5530648827552795, + "rewards/thermo_reward/std": 1.9085291624069214, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 288.03125, + "completions/mean_terminated_length": 288.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.10206129332073033, + "epoch": 0.236, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.308597087860107, + "learning_rate": 1.8957117602394128e-06, + "loss": 0.1213, + "num_tokens": 1035466.0, + "reward": 3.710597038269043, + "reward_std": 2.7380971908569336, + "rewards/fitness_reward/mean": 3.901604175567627, + "rewards/fitness_reward/std": 2.8767783641815186, + "rewards/kidney_reward/mean": -0.2230178266763687, + "rewards/kidney_reward/std": 1.0350183248519897, + "rewards/length2tails_reward/mean": 0.439441978931427, + "rewards/length2tails_reward/std": 0.4717496633529663, + "rewards/thermo_reward/mean": -0.37871700525283813, + "rewards/thermo_reward/std": 1.4095770120620728, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 272.71875, + "completions/mean_terminated_length": 272.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07676034257747233, + "epoch": 0.238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9029649496078491, + "learning_rate": 1.8925858184521255e-06, + "loss": 0.0311, + "num_tokens": 1044225.0, + "reward": 3.1038806438446045, + "reward_std": 2.6872596740722656, + "rewards/fitness_reward/mean": 3.926145315170288, + "rewards/fitness_reward/std": 3.1735939979553223, + "rewards/kidney_reward/mean": -0.5126339197158813, + "rewards/kidney_reward/std": 0.9371849298477173, + "rewards/length2tails_reward/mean": 0.5384362936019897, + "rewards/length2tails_reward/std": 0.48361003398895264, + "rewards/thermo_reward/mean": -1.401113510131836, + "rewards/thermo_reward/std": 1.7026857137680054, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 262.03125, + "completions/mean_terminated_length": 262.03125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.06442607054486871, + "epoch": 0.24, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43014267086982727, + "learning_rate": 1.8894163732912974e-06, + "loss": -0.0554, + "num_tokens": 1052642.0, + "reward": 4.679306983947754, + "reward_std": 2.9649219512939453, + "rewards/fitness_reward/mean": 4.555464744567871, + "rewards/fitness_reward/std": 2.9455044269561768, + "rewards/kidney_reward/mean": 0.28108876943588257, + "rewards/kidney_reward/std": 1.0552774667739868, + "rewards/length2tails_reward/mean": 0.4020635783672333, + "rewards/length2tails_reward/std": 0.4390629231929779, + "rewards/thermo_reward/mean": -0.23443603515625, + "rewards/thermo_reward/std": 1.6590263843536377, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 274.34375, + "completions/mean_terminated_length": 274.34375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.12703936896286905, + "epoch": 0.242, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.771202802658081, + "learning_rate": 1.8862035792312146e-06, + "loss": 0.0538, + "num_tokens": 1061453.0, + "reward": 3.4370691776275635, + "reward_std": 3.001307725906372, + "rewards/fitness_reward/mean": 3.9468846321105957, + "rewards/fitness_reward/std": 3.393200397491455, + "rewards/kidney_reward/mean": -0.46353626251220703, + "rewards/kidney_reward/std": 0.7753000259399414, + "rewards/length2tails_reward/mean": 0.4690501093864441, + "rewards/length2tails_reward/std": 0.4708133935928345, + "rewards/thermo_reward/mean": -0.7906193733215332, + "rewards/thermo_reward/std": 1.7162280082702637, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 264.1875, + "completions/mean_terminated_length": 264.1875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.05585645651444793, + "epoch": 0.244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5052928924560547, + "learning_rate": 1.8829475928589268e-06, + "loss": -0.0554, + "num_tokens": 1069939.0, + "reward": 3.7392311096191406, + "reward_std": 2.379016399383545, + "rewards/fitness_reward/mean": 4.264631271362305, + "rewards/fitness_reward/std": 2.363891839981079, + "rewards/kidney_reward/mean": -0.2556886672973633, + "rewards/kidney_reward/std": 0.9878360629081726, + "rewards/length2tails_reward/mean": 0.39914143085479736, + "rewards/length2tails_reward/std": 0.47185075283050537, + "rewards/thermo_reward/mean": -0.9946828484535217, + "rewards/thermo_reward/std": 1.5393421649932861, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 278.78125, + "completions/mean_terminated_length": 278.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.06387731153517962, + "epoch": 0.246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9780254364013672, + "learning_rate": 1.8796485728666165e-06, + "loss": 0.1166, + "num_tokens": 1078892.0, + "reward": 3.3900699615478516, + "reward_std": 2.7406327724456787, + "rewards/fitness_reward/mean": 3.765007495880127, + "rewards/fitness_reward/std": 2.5442514419555664, + "rewards/kidney_reward/mean": -0.046911612153053284, + "rewards/kidney_reward/std": 0.9505102634429932, + "rewards/length2tails_reward/mean": 0.27233248949050903, + "rewards/length2tails_reward/std": 0.4196857810020447, + "rewards/thermo_reward/mean": -0.8391291499137878, + "rewards/thermo_reward/std": 1.3835790157318115, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 263.4375, + "completions/mean_terminated_length": 263.4375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.13032073702197522, + "epoch": 0.248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5727119445800781, + "learning_rate": 1.8763066800438634e-06, + "loss": -0.0356, + "num_tokens": 1087354.0, + "reward": 3.966728448867798, + "reward_std": 3.162716865539551, + "rewards/fitness_reward/mean": 4.33012580871582, + "rewards/fitness_reward/std": 3.2471702098846436, + "rewards/kidney_reward/mean": 0.0831863209605217, + "rewards/kidney_reward/std": 1.002663254737854, + "rewards/length2tails_reward/mean": 0.3769824504852295, + "rewards/length2tails_reward/std": 0.4535328149795532, + "rewards/thermo_reward/mean": -0.9984723329544067, + "rewards/thermo_reward/std": 1.4239492416381836, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 282.9375, + "completions/mean_terminated_length": 282.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09651394619140774, + "epoch": 0.25, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.227329969406128, + "learning_rate": 1.8729220772698095e-06, + "loss": 0.0957, + "num_tokens": 1096440.0, + "reward": 2.99495005607605, + "reward_std": 3.3334567546844482, + "rewards/fitness_reward/mean": 3.3388710021972656, + "rewards/fitness_reward/std": 3.4859747886657715, + "rewards/kidney_reward/mean": -0.3218488097190857, + "rewards/kidney_reward/std": 0.9957481622695923, + "rewards/length2tails_reward/mean": 0.4028223156929016, + "rewards/length2tails_reward/std": 0.45646458864212036, + "rewards/thermo_reward/mean": -0.5674041509628296, + "rewards/thermo_reward/std": 1.813733696937561, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.10553285828791559, + "epoch": 0.252, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9625844359397888, + "learning_rate": 1.869494929505219e-06, + "loss": 0.0207, + "num_tokens": 1105150.0, + "reward": 3.427802085876465, + "reward_std": 3.8064815998077393, + "rewards/fitness_reward/mean": 3.734769344329834, + "rewards/fitness_reward/std": 4.140002727508545, + "rewards/kidney_reward/mean": -0.16808955371379852, + "rewards/kidney_reward/std": 0.9854453802108765, + "rewards/length2tails_reward/mean": 0.5585049986839294, + "rewards/length2tails_reward/std": 0.47287943959236145, + "rewards/thermo_reward/mean": -0.7250969409942627, + "rewards/thermo_reward/std": 1.93299400806427, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08411063288804144, + "epoch": 0.254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2926709651947021, + "learning_rate": 1.8660254037844386e-06, + "loss": 0.0231, + "num_tokens": 1113843.0, + "reward": 3.6425962448120117, + "reward_std": 3.0378100872039795, + "rewards/fitness_reward/mean": 4.0191650390625, + "rewards/fitness_reward/std": 2.870126724243164, + "rewards/kidney_reward/mean": -0.3228492736816406, + "rewards/kidney_reward/std": 0.7792383432388306, + "rewards/length2tails_reward/mean": 0.3848269581794739, + "rewards/length2tails_reward/std": 0.45583122968673706, + "rewards/thermo_reward/mean": -0.6227012872695923, + "rewards/thermo_reward/std": 1.4705400466918945, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 268.21875, + "completions/mean_terminated_length": 268.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.06420792755670846, + "epoch": 0.256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7759255766868591, + "learning_rate": 1.8625136692072574e-06, + "loss": -0.0121, + "num_tokens": 1122458.0, + "reward": 3.8680875301361084, + "reward_std": 2.33729887008667, + "rewards/fitness_reward/mean": 4.139684677124023, + "rewards/fitness_reward/std": 2.4506237506866455, + "rewards/kidney_reward/mean": -0.3782632350921631, + "rewards/kidney_reward/std": 0.7467220425605774, + "rewards/length2tails_reward/mean": 0.386510968208313, + "rewards/length2tails_reward/std": 0.4685826301574707, + "rewards/thermo_reward/mean": -0.3581863045692444, + "rewards/thermo_reward/std": 1.5898200273513794, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 267.75, + "completions/mean_terminated_length": 267.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.06495336792431772, + "epoch": 0.258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24202904105186462, + "learning_rate": 1.8589598969306644e-06, + "loss": -0.0058, + "num_tokens": 1131058.0, + "reward": 4.778224945068359, + "reward_std": 1.8929619789123535, + "rewards/fitness_reward/mean": 5.180815696716309, + "rewards/fitness_reward/std": 2.0846951007843018, + "rewards/kidney_reward/mean": -0.29253044724464417, + "rewards/kidney_reward/std": 1.0237658023834229, + "rewards/length2tails_reward/mean": 0.3840765953063965, + "rewards/length2tails_reward/std": 0.4389142692089081, + "rewards/thermo_reward/mean": -0.7046886682510376, + "rewards/thermo_reward/std": 1.6220309734344482, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 276.5, + "completions/mean_terminated_length": 276.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09890062315389514, + "epoch": 0.26, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5762393474578857, + "learning_rate": 1.8553642601605066e-06, + "loss": 0.0766, + "num_tokens": 1139938.0, + "reward": 4.35146951675415, + "reward_std": 3.159916877746582, + "rewards/fitness_reward/mean": 4.681078910827637, + "rewards/fitness_reward/std": 3.1895546913146973, + "rewards/kidney_reward/mean": -0.3678491711616516, + "rewards/kidney_reward/std": 1.1440036296844482, + "rewards/length2tails_reward/mean": 0.49155309796333313, + "rewards/length2tails_reward/std": 0.44114190340042114, + "rewards/thermo_reward/mean": -0.5371465086936951, + "rewards/thermo_reward/std": 1.8674601316452026, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 277.125, + "completions/mean_terminated_length": 261.7419128417969, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.10243750689551234, + "epoch": 0.262, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1447086334228516, + "learning_rate": 1.8517269341430474e-06, + "loss": -0.1169, + "num_tokens": 1148838.0, + "reward": 4.371088027954102, + "reward_std": 3.315836191177368, + "rewards/fitness_reward/mean": 4.446979999542236, + "rewards/fitness_reward/std": 3.301693916320801, + "rewards/kidney_reward/mean": 0.21159544587135315, + "rewards/kidney_reward/std": 1.2276256084442139, + "rewards/length2tails_reward/mean": 0.4857953190803528, + "rewards/length2tails_reward/std": 0.47748568654060364, + "rewards/thermo_reward/mean": -0.6062768697738647, + "rewards/thermo_reward/std": 1.8472959995269775, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 266.25, + "completions/mean_terminated_length": 266.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.042758471332490444, + "epoch": 0.264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35708656907081604, + "learning_rate": 1.8480480961564257e-06, + "loss": -0.0114, + "num_tokens": 1157390.0, + "reward": 4.053591251373291, + "reward_std": 1.8370122909545898, + "rewards/fitness_reward/mean": 4.428528308868408, + "rewards/fitness_reward/std": 1.5903440713882446, + "rewards/kidney_reward/mean": -0.29007411003112793, + "rewards/kidney_reward/std": 0.7652667164802551, + "rewards/length2tails_reward/mean": 0.23366816341876984, + "rewards/length2tails_reward/std": 0.39625921845436096, + "rewards/thermo_reward/mean": -0.5766348838806152, + "rewards/thermo_reward/std": 1.304244041442871, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 268.78125, + "completions/mean_terminated_length": 268.78125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.15004280488938093, + "epoch": 0.266, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6409659385681152, + "learning_rate": 1.844327925502015e-06, + "loss": -0.0015, + "num_tokens": 1166023.0, + "reward": 3.3718714714050293, + "reward_std": 4.05106258392334, + "rewards/fitness_reward/mean": 3.524176836013794, + "rewards/fitness_reward/std": 4.102492332458496, + "rewards/kidney_reward/mean": -0.29524558782577515, + "rewards/kidney_reward/std": 0.9443924427032471, + "rewards/length2tails_reward/mean": 0.5598483681678772, + "rewards/length2tails_reward/std": 0.4729120135307312, + "rewards/thermo_reward/mean": -0.289289653301239, + "rewards/thermo_reward/std": 1.761705994606018, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 277.25, + "completions/mean_terminated_length": 277.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08702222676947713, + "epoch": 0.268, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.362683892250061, + "learning_rate": 1.8405666034956842e-06, + "loss": 0.0883, + "num_tokens": 1174927.0, + "reward": 4.026025295257568, + "reward_std": 2.786008834838867, + "rewards/fitness_reward/mean": 4.502394676208496, + "rewards/fitness_reward/std": 3.0564589500427246, + "rewards/kidney_reward/mean": -0.4614133834838867, + "rewards/kidney_reward/std": 1.0194756984710693, + "rewards/length2tails_reward/mean": 0.4920497536659241, + "rewards/length2tails_reward/std": 0.47884926199913025, + "rewards/thermo_reward/mean": -0.7373506426811218, + "rewards/thermo_reward/std": 1.7863677740097046, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 267.5, + "completions/mean_terminated_length": 267.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.06600146123673767, + "epoch": 0.27, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3214907646179199, + "learning_rate": 1.8367643134589616e-06, + "loss": -0.009, + "num_tokens": 1183519.0, + "reward": 4.572535514831543, + "reward_std": 1.9083198308944702, + "rewards/fitness_reward/mean": 5.0464630126953125, + "rewards/fitness_reward/std": 1.6709182262420654, + "rewards/kidney_reward/mean": -0.2989814281463623, + "rewards/kidney_reward/std": 0.8420076370239258, + "rewards/length2tails_reward/mean": 0.4243019223213196, + "rewards/length2tails_reward/std": 0.45737841725349426, + "rewards/thermo_reward/mean": -0.8610237836837769, + "rewards/thermo_reward/std": 1.6364610195159912, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 281.875, + "completions/mean_terminated_length": 281.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.139061838388443, + "epoch": 0.272, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1695799827575684, + "learning_rate": 1.8329212407100993e-06, + "loss": 0.1079, + "num_tokens": 1192571.0, + "reward": 3.5707364082336426, + "reward_std": 2.9870543479919434, + "rewards/fitness_reward/mean": 3.8914880752563477, + "rewards/fitness_reward/std": 3.2452211380004883, + "rewards/kidney_reward/mean": -0.30540257692337036, + "rewards/kidney_reward/std": 0.9802507162094116, + "rewards/length2tails_reward/mean": 0.4223281145095825, + "rewards/length2tails_reward/std": 0.48747193813323975, + "rewards/thermo_reward/mean": -0.5472647547721863, + "rewards/thermo_reward/std": 1.4930212497711182, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 274.875, + "completions/mean_terminated_length": 274.875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.1304667112417519, + "epoch": 0.274, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.339874029159546, + "learning_rate": 1.8290375725550415e-06, + "loss": 0.0335, + "num_tokens": 1201399.0, + "reward": 4.265954494476318, + "reward_std": 3.219212293624878, + "rewards/fitness_reward/mean": 4.625532150268555, + "rewards/fitness_reward/std": 3.3417165279388428, + "rewards/kidney_reward/mean": -0.1432976871728897, + "rewards/kidney_reward/std": 0.9689578413963318, + "rewards/length2tails_reward/mean": 0.6148507595062256, + "rewards/length2tails_reward/std": 0.451657772064209, + "rewards/thermo_reward/mean": -0.8832824230194092, + "rewards/thermo_reward/std": 1.83414626121521, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 265.28125, + "completions/mean_terminated_length": 265.28125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.0816122842952609, + "epoch": 0.276, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1885217428207397, + "learning_rate": 1.825113498278295e-06, + "loss": -0.0397, + "num_tokens": 1209920.0, + "reward": 4.002777576446533, + "reward_std": 2.7675578594207764, + "rewards/fitness_reward/mean": 4.709897994995117, + "rewards/fitness_reward/std": 2.7510030269622803, + "rewards/kidney_reward/mean": -0.6250013113021851, + "rewards/kidney_reward/std": 0.8238852024078369, + "rewards/length2tails_reward/mean": 0.4693373739719391, + "rewards/length2tails_reward/std": 0.46592721343040466, + "rewards/thermo_reward/mean": -1.0239073038101196, + "rewards/thermo_reward/std": 1.7034920454025269, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 267.46875, + "completions/mean_terminated_length": 267.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.06394899217411876, + "epoch": 0.278, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2774324417114258, + "learning_rate": 1.821149209133704e-06, + "loss": -0.0098, + "num_tokens": 1218511.0, + "reward": 3.903625249862671, + "reward_std": 2.363471031188965, + "rewards/fitness_reward/mean": 4.4604902267456055, + "rewards/fitness_reward/std": 2.4980661869049072, + "rewards/kidney_reward/mean": -0.4132443368434906, + "rewards/kidney_reward/std": 0.9047765731811523, + "rewards/length2tails_reward/mean": 0.3875090479850769, + "rewards/length2tails_reward/std": 0.4659252464771271, + "rewards/thermo_reward/mean": -0.89424067735672, + "rewards/thermo_reward/std": 1.5600018501281738, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.5625, + "completions/mean_terminated_length": 267.5625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.07024233182892203, + "epoch": 0.28, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.926374614238739, + "learning_rate": 1.8171448983351283e-06, + "loss": -0.0145, + "num_tokens": 1227105.0, + "reward": 3.698070764541626, + "reward_std": 3.252227783203125, + "rewards/fitness_reward/mean": 4.266557216644287, + "rewards/fitness_reward/std": 3.184821605682373, + "rewards/kidney_reward/mean": -0.3036004900932312, + "rewards/kidney_reward/std": 0.8457627296447754, + "rewards/length2tails_reward/mean": 0.45677104592323303, + "rewards/length2tails_reward/std": 0.4728664755821228, + "rewards/thermo_reward/mean": -1.061758279800415, + "rewards/thermo_reward/std": 1.7555971145629883, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.07606290956027806, + "epoch": 0.282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6766843795776367, + "learning_rate": 1.8131007610470274e-06, + "loss": 0.0345, + "num_tokens": 1235833.0, + "reward": 3.8650505542755127, + "reward_std": 2.416300058364868, + "rewards/fitness_reward/mean": 4.350872039794922, + "rewards/fitness_reward/std": 2.4987759590148926, + "rewards/kidney_reward/mean": -0.1330445110797882, + "rewards/kidney_reward/std": 1.0574065446853638, + "rewards/length2tails_reward/mean": 0.4554171562194824, + "rewards/length2tails_reward/std": 0.472525954246521, + "rewards/thermo_reward/mean": -1.0663063526153564, + "rewards/thermo_reward/std": 1.5161410570144653, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 268.8125, + "completions/mean_terminated_length": 268.8125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08262498816475272, + "epoch": 0.284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5688827633857727, + "learning_rate": 1.8090169943749474e-06, + "loss": -0.0083, + "num_tokens": 1244467.0, + "reward": 3.845224380493164, + "reward_std": 3.0756232738494873, + "rewards/fitness_reward/mean": 4.202630996704102, + "rewards/fitness_reward/std": 3.3465781211853027, + "rewards/kidney_reward/mean": -0.0376429483294487, + "rewards/kidney_reward/std": 1.187543272972107, + "rewards/length2tails_reward/mean": 0.5421406626701355, + "rewards/length2tails_reward/std": 0.4655161201953888, + "rewards/thermo_reward/mean": -0.9482405185699463, + "rewards/thermo_reward/std": 1.8914945125579834, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 267.75, + "completions/mean_terminated_length": 267.75, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.09104419802315533, + "epoch": 0.286, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3129907846450806, + "learning_rate": 1.804893797355914e-06, + "loss": -0.0001, + "num_tokens": 1253067.0, + "reward": 3.577446937561035, + "reward_std": 3.234891891479492, + "rewards/fitness_reward/mean": 4.251026153564453, + "rewards/fitness_reward/std": 3.157855749130249, + "rewards/kidney_reward/mean": -0.5096219182014465, + "rewards/kidney_reward/std": 0.8204793334007263, + "rewards/length2tails_reward/mean": 0.4574005901813507, + "rewards/length2tails_reward/std": 0.46461042761802673, + "rewards/thermo_reward/mean": -1.066237449645996, + "rewards/thermo_reward/std": 1.6232463121414185, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 276.25, + "completions/mean_terminated_length": 276.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12595916108693928, + "epoch": 0.288, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8075847625732422, + "learning_rate": 1.8007313709487333e-06, + "loss": 0.0794, + "num_tokens": 1261939.0, + "reward": 3.491788387298584, + "reward_std": 2.7610220909118652, + "rewards/fitness_reward/mean": 3.9053263664245605, + "rewards/fitness_reward/std": 3.2118325233459473, + "rewards/kidney_reward/mean": -0.27789413928985596, + "rewards/kidney_reward/std": 0.9338050484657288, + "rewards/length2tails_reward/mean": 0.3519634008407593, + "rewards/length2tails_reward/std": 0.45090824365615845, + "rewards/thermo_reward/mean": -0.7251632213592529, + "rewards/thermo_reward/std": 1.7529805898666382, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 278.8125, + "completions/mean_terminated_length": 278.8125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.09955365816131234, + "epoch": 0.29, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4168295860290527, + "learning_rate": 1.7965299180241961e-06, + "loss": 0.1004, + "num_tokens": 1270893.0, + "reward": 4.013087272644043, + "reward_std": 3.1201953887939453, + "rewards/fitness_reward/mean": 4.355756759643555, + "rewards/fitness_reward/std": 3.185384750366211, + "rewards/kidney_reward/mean": -0.24855157732963562, + "rewards/kidney_reward/std": 1.0533267259597778, + "rewards/length2tails_reward/mean": 0.4537443518638611, + "rewards/length2tails_reward/std": 0.4465175271034241, + "rewards/thermo_reward/mean": -0.6636590957641602, + "rewards/thermo_reward/std": 1.7202668190002441, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 262.1875, + "completions/mean_terminated_length": 262.1875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.08948546904139221, + "epoch": 0.292, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4365017414093018, + "learning_rate": 1.7922896433551906e-06, + "loss": -0.068, + "num_tokens": 1279315.0, + "reward": 4.147869110107422, + "reward_std": 2.6149790287017822, + "rewards/fitness_reward/mean": 4.61093282699585, + "rewards/fitness_reward/std": 3.051393985748291, + "rewards/kidney_reward/mean": -0.5923123359680176, + "rewards/kidney_reward/std": 0.7838603258132935, + "rewards/length2tails_reward/mean": 0.4657912850379944, + "rewards/length2tails_reward/std": 0.4592147767543793, + "rewards/thermo_reward/mean": -0.5667106509208679, + "rewards/thermo_reward/std": 1.8495798110961914, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 263.40625, + "completions/mean_terminated_length": 263.40625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.07509045209735632, + "epoch": 0.294, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3135392665863037, + "learning_rate": 1.7880107536067217e-06, + "loss": -0.0448, + "num_tokens": 1287776.0, + "reward": 4.450174331665039, + "reward_std": 3.310900926589966, + "rewards/fitness_reward/mean": 4.654674530029297, + "rewards/fitness_reward/std": 2.906294345855713, + "rewards/kidney_reward/mean": -0.15413539111614227, + "rewards/kidney_reward/std": 1.2075201272964478, + "rewards/length2tails_reward/mean": 0.4313182830810547, + "rewards/length2tails_reward/std": 0.43745866417884827, + "rewards/thermo_reward/mean": -0.47052454948425293, + "rewards/thermo_reward/std": 1.5457390546798706, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 274.78125, + "completions/mean_terminated_length": 274.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08302459074184299, + "epoch": 0.296, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.052696704864502, + "learning_rate": 1.7836934573258397e-06, + "loss": 0.081, + "num_tokens": 1296601.0, + "reward": 3.958801746368408, + "reward_std": 2.6489288806915283, + "rewards/fitness_reward/mean": 4.402061462402344, + "rewards/fitness_reward/std": 2.685471773147583, + "rewards/kidney_reward/mean": -0.41507062315940857, + "rewards/kidney_reward/std": 0.9083747863769531, + "rewards/length2tails_reward/mean": 0.41056132316589355, + "rewards/length2tails_reward/std": 0.4427841007709503, + "rewards/thermo_reward/mean": -0.6767306923866272, + "rewards/thermo_reward/std": 1.6160317659378052, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 269.21875, + "completions/mean_terminated_length": 269.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09352858271449804, + "epoch": 0.298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9183940291404724, + "learning_rate": 1.7793379649314742e-06, + "loss": 0.0026, + "num_tokens": 1305248.0, + "reward": 4.4203948974609375, + "reward_std": 3.2280428409576416, + "rewards/fitness_reward/mean": 4.428203105926514, + "rewards/fitness_reward/std": 2.9869213104248047, + "rewards/kidney_reward/mean": -0.04570910334587097, + "rewards/kidney_reward/std": 1.0263012647628784, + "rewards/length2tails_reward/mean": 0.49454542994499207, + "rewards/length2tails_reward/std": 0.4444493353366852, + "rewards/thermo_reward/mean": -0.21718043088912964, + "rewards/thermo_reward/std": 1.7211302518844604, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 268.78125, + "completions/mean_terminated_length": 268.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.06938954535871744, + "epoch": 0.3, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5907869935035706, + "learning_rate": 1.7749444887041795e-06, + "loss": -0.0089, + "num_tokens": 1313881.0, + "reward": 4.830990791320801, + "reward_std": 2.054767608642578, + "rewards/fitness_reward/mean": 5.171717166900635, + "rewards/fitness_reward/std": 2.118720531463623, + "rewards/kidney_reward/mean": -0.4066958427429199, + "rewards/kidney_reward/std": 1.0702879428863525, + "rewards/length2tails_reward/mean": 0.4780086874961853, + "rewards/length2tails_reward/std": 0.4582548439502716, + "rewards/thermo_reward/mean": -0.5137604475021362, + "rewards/thermo_reward/std": 1.755508542060852, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 266.9375, + "completions/mean_terminated_length": 266.9375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.08493705908767879, + "epoch": 0.302, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0603394508361816, + "learning_rate": 1.7705132427757892e-06, + "loss": -0.0185, + "num_tokens": 1322455.0, + "reward": 4.334216117858887, + "reward_std": 2.548267126083374, + "rewards/fitness_reward/mean": 4.720700263977051, + "rewards/fitness_reward/std": 2.269113302230835, + "rewards/kidney_reward/mean": -0.40105870366096497, + "rewards/kidney_reward/std": 1.1038024425506592, + "rewards/length2tails_reward/mean": 0.4668967127799988, + "rewards/length2tails_reward/std": 0.43700388073921204, + "rewards/thermo_reward/mean": -0.6053584814071655, + "rewards/thermo_reward/std": 1.6193071603775024, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08874433115124702, + "epoch": 0.304, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35046184062957764, + "learning_rate": 1.766044443118978e-06, + "loss": -0.0129, + "num_tokens": 1331245.0, + "reward": 4.575962066650391, + "reward_std": 2.263967752456665, + "rewards/fitness_reward/mean": 5.18062162399292, + "rewards/fitness_reward/std": 2.085413932800293, + "rewards/kidney_reward/mean": -0.4388875961303711, + "rewards/kidney_reward/std": 0.8940988183021545, + "rewards/length2tails_reward/mean": 0.6060910224914551, + "rewards/length2tails_reward/std": 0.45901593565940857, + "rewards/thermo_reward/mean": -1.0734763145446777, + "rewards/thermo_reward/std": 1.7066905498504639, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 267.75, + "completions/mean_terminated_length": 267.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.06172634055837989, + "epoch": 0.306, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3126675486564636, + "learning_rate": 1.7615383075367368e-06, + "loss": -0.0105, + "num_tokens": 1339845.0, + "reward": 4.250274658203125, + "reward_std": 2.5619237422943115, + "rewards/fitness_reward/mean": 4.572265625, + "rewards/fitness_reward/std": 2.491471529006958, + "rewards/kidney_reward/mean": -0.27898108959198, + "rewards/kidney_reward/std": 0.7466369271278381, + "rewards/length2tails_reward/mean": 0.35668960213661194, + "rewards/length2tails_reward/std": 0.44767114520072937, + "rewards/thermo_reward/mean": -0.5433446764945984, + "rewards/thermo_reward/std": 1.584446907043457, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 267.71875, + "completions/mean_terminated_length": 267.71875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.09085593721829355, + "epoch": 0.308, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.448189616203308, + "learning_rate": 1.7569950556517563e-06, + "loss": -0.015, + "num_tokens": 1348444.0, + "reward": 4.617559432983398, + "reward_std": 2.328192710876465, + "rewards/fitness_reward/mean": 5.206052780151367, + "rewards/fitness_reward/std": 2.386223793029785, + "rewards/kidney_reward/mean": -0.29924219846725464, + "rewards/kidney_reward/std": 0.9480889439582825, + "rewards/length2tails_reward/mean": 0.5548272132873535, + "rewards/length2tails_reward/std": 0.45655137300491333, + "rewards/thermo_reward/mean": -1.1551580429077148, + "rewards/thermo_reward/std": 1.9274123907089233, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 267.6875, + "completions/mean_terminated_length": 267.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.05919750058092177, + "epoch": 0.31, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24001403152942657, + "learning_rate": 1.7524149088957242e-06, + "loss": -0.006, + "num_tokens": 1357042.0, + "reward": 4.215601444244385, + "reward_std": 1.6457915306091309, + "rewards/fitness_reward/mean": 5.0464630126953125, + "rewards/fitness_reward/std": 1.6709182262420654, + "rewards/kidney_reward/mean": -0.6109973192214966, + "rewards/kidney_reward/std": 0.6350767016410828, + "rewards/length2tails_reward/mean": 0.4455958604812622, + "rewards/length2tails_reward/std": 0.45376908779144287, + "rewards/thermo_reward/mean": -1.2735230922698975, + "rewards/thermo_reward/std": 1.7035574913024902, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 266.75, + "completions/mean_terminated_length": 266.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.048335404484532773, + "epoch": 0.312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4476269781589508, + "learning_rate": 1.747798090498532e-06, + "loss": -0.006, + "num_tokens": 1365610.0, + "reward": 3.9586968421936035, + "reward_std": 2.452965021133423, + "rewards/fitness_reward/mean": 4.494963645935059, + "rewards/fitness_reward/std": 2.396350145339966, + "rewards/kidney_reward/mean": -0.33061426877975464, + "rewards/kidney_reward/std": 0.932668924331665, + "rewards/length2tails_reward/mean": 0.33582741022109985, + "rewards/length2tails_reward/std": 0.45137956738471985, + "rewards/thermo_reward/mean": -0.9098325371742249, + "rewards/thermo_reward/std": 1.6104899644851685, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 289.40625, + "completions/mean_terminated_length": 289.40625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.1205522520467639, + "epoch": 0.314, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.124131917953491, + "learning_rate": 1.743144825477394e-06, + "loss": 0.151, + "num_tokens": 1374903.0, + "reward": 3.7194204330444336, + "reward_std": 3.6462106704711914, + "rewards/fitness_reward/mean": 3.876713752746582, + "rewards/fitness_reward/std": 3.5486321449279785, + "rewards/kidney_reward/mean": -0.2034071534872055, + "rewards/kidney_reward/std": 1.0929036140441895, + "rewards/length2tails_reward/mean": 0.5232511758804321, + "rewards/length2tails_reward/std": 0.4814155101776123, + "rewards/thermo_reward/mean": -0.37280526757240295, + "rewards/thermo_reward/std": 1.645125389099121, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 280.0, + "completions/mean_terminated_length": 280.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13657673075795174, + "epoch": 0.316, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0489912033081055, + "learning_rate": 1.738455340625884e-06, + "loss": 0.1117, + "num_tokens": 1383895.0, + "reward": 3.96601939201355, + "reward_std": 3.3946123123168945, + "rewards/fitness_reward/mean": 4.056211471557617, + "rewards/fitness_reward/std": 3.094337224960327, + "rewards/kidney_reward/mean": -0.0832638144493103, + "rewards/kidney_reward/std": 1.1776015758514404, + "rewards/length2tails_reward/mean": 0.4087567627429962, + "rewards/length2tails_reward/std": 0.4787772297859192, + "rewards/thermo_reward/mean": -0.3014984726905823, + "rewards/thermo_reward/std": 1.4508898258209229, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 268.1875, + "completions/mean_terminated_length": 268.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.06550772534683347, + "epoch": 0.318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18433576822280884, + "learning_rate": 1.7337298645028762e-06, + "loss": -0.0032, + "num_tokens": 1392509.0, + "reward": 4.696930885314941, + "reward_std": 2.494917392730713, + "rewards/fitness_reward/mean": 5.274045944213867, + "rewards/fitness_reward/std": 2.107184410095215, + "rewards/kidney_reward/mean": -0.14291353523731232, + "rewards/kidney_reward/std": 1.0486446619033813, + "rewards/length2tails_reward/mean": 0.46765953302383423, + "rewards/length2tails_reward/std": 0.4551301896572113, + "rewards/thermo_reward/mean": -1.2451454401016235, + "rewards/thermo_reward/std": 1.6586072444915771, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 280.125, + "completions/mean_terminated_length": 280.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.1450829952955246, + "epoch": 0.32, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8760439157485962, + "learning_rate": 1.7289686274214115e-06, + "loss": 0.0762, + "num_tokens": 1401505.0, + "reward": 3.5760703086853027, + "reward_std": 3.456336498260498, + "rewards/fitness_reward/mean": 3.7789759635925293, + "rewards/fitness_reward/std": 3.8132376670837402, + "rewards/kidney_reward/mean": -0.31448692083358765, + "rewards/kidney_reward/std": 1.1399399042129517, + "rewards/length2tails_reward/mean": 0.5606054663658142, + "rewards/length2tails_reward/std": 0.44976624846458435, + "rewards/thermo_reward/mean": -0.37162667512893677, + "rewards/thermo_reward/std": 1.8894951343536377, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 276.4375, + "completions/mean_terminated_length": 276.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.08699913625605404, + "epoch": 0.322, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078630208969116, + "learning_rate": 1.7241718614374676e-06, + "loss": 0.0915, + "num_tokens": 1410383.0, + "reward": 4.635051727294922, + "reward_std": 2.282209873199463, + "rewards/fitness_reward/mean": 4.993934631347656, + "rewards/fitness_reward/std": 2.4263358116149902, + "rewards/kidney_reward/mean": -0.27896836400032043, + "rewards/kidney_reward/std": 1.1161572933197021, + "rewards/length2tails_reward/mean": 0.5413184762001038, + "rewards/length2tails_reward/std": 0.4588867127895355, + "rewards/thermo_reward/mean": -0.7094570398330688, + "rewards/thermo_reward/std": 1.8480961322784424, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 277.53125, + "completions/mean_terminated_length": 277.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11324500991031528, + "epoch": 0.324, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1256017684936523, + "learning_rate": 1.719339800338651e-06, + "loss": 0.0631, + "num_tokens": 1419296.0, + "reward": 3.5607268810272217, + "reward_std": 3.390486240386963, + "rewards/fitness_reward/mean": 3.993180274963379, + "rewards/fitness_reward/std": 3.614488124847412, + "rewards/kidney_reward/mean": -0.2305232584476471, + "rewards/kidney_reward/std": 1.0963412523269653, + "rewards/length2tails_reward/mean": 0.5686094760894775, + "rewards/length2tails_reward/std": 0.4640563726425171, + "rewards/thermo_reward/mean": -0.9186879396438599, + "rewards/thermo_reward/std": 1.859251856803894, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.09377507958561182, + "epoch": 0.326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5230877995491028, + "learning_rate": 1.7144726796328032e-06, + "loss": -0.0152, + "num_tokens": 1427960.0, + "reward": 5.218613624572754, + "reward_std": 2.639878273010254, + "rewards/fitness_reward/mean": 4.980596542358398, + "rewards/fitness_reward/std": 2.531390428543091, + "rewards/kidney_reward/mean": 0.03574337065219879, + "rewards/kidney_reward/std": 1.0980381965637207, + "rewards/length2tails_reward/mean": 0.499345064163208, + "rewards/length2tails_reward/std": 0.45680156350135803, + "rewards/thermo_reward/mean": 0.19061768054962158, + "rewards/thermo_reward/std": 1.5000156164169312, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.4375, + "completions/mean_terminated_length": 267.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.0813692519441247, + "epoch": 0.328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6022782325744629, + "learning_rate": 1.7095707365365209e-06, + "loss": -0.0071, + "num_tokens": 1436550.0, + "reward": 4.692296981811523, + "reward_std": 2.3082761764526367, + "rewards/fitness_reward/mean": 4.976771831512451, + "rewards/fitness_reward/std": 2.091456174850464, + "rewards/kidney_reward/mean": -0.1753438264131546, + "rewards/kidney_reward/std": 1.1265095472335815, + "rewards/length2tails_reward/mean": 0.45045024156570435, + "rewards/length2tails_reward/std": 0.4337252676486969, + "rewards/thermo_reward/mean": -0.618831217288971, + "rewards/thermo_reward/std": 1.7613445520401, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 266.46875, + "completions/mean_terminated_length": 266.46875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.052766332402825356, + "epoch": 0.33, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45419779419898987, + "learning_rate": 1.7046342099635947e-06, + "loss": -0.0043, + "num_tokens": 1445109.0, + "reward": 3.656085252761841, + "reward_std": 2.750291109085083, + "rewards/fitness_reward/mean": 4.10206413269043, + "rewards/fitness_reward/std": 2.6468820571899414, + "rewards/kidney_reward/mean": -0.2564317584037781, + "rewards/kidney_reward/std": 1.0081008672714233, + "rewards/length2tails_reward/mean": 0.31663069128990173, + "rewards/length2tails_reward/std": 0.41682401299476624, + "rewards/thermo_reward/mean": -0.7938418984413147, + "rewards/thermo_reward/std": 1.628943681716919, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.14256688253954053, + "epoch": 0.332, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8583552837371826, + "learning_rate": 1.6996633405133653e-06, + "loss": 0.0395, + "num_tokens": 1453807.0, + "reward": 3.794830799102783, + "reward_std": 3.588675022125244, + "rewards/fitness_reward/mean": 3.962114095687866, + "rewards/fitness_reward/std": 3.6419920921325684, + "rewards/kidney_reward/mean": -0.18258120119571686, + "rewards/kidney_reward/std": 1.058857798576355, + "rewards/length2tails_reward/mean": 0.43951690196990967, + "rewards/length2tails_reward/std": 0.4417087137699127, + "rewards/thermo_reward/mean": -0.37174350023269653, + "rewards/thermo_reward/std": 1.7920690774917603, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 266.875, + "completions/mean_terminated_length": 266.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.0489120373968035, + "epoch": 0.334, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3171038031578064, + "learning_rate": 1.6946583704589972e-06, + "loss": -0.0062, + "num_tokens": 1462379.0, + "reward": 4.280658721923828, + "reward_std": 1.986229419708252, + "rewards/fitness_reward/mean": 4.737495422363281, + "rewards/fitness_reward/std": 1.6610602140426636, + "rewards/kidney_reward/mean": -0.3628132939338684, + "rewards/kidney_reward/std": 0.8554661870002747, + "rewards/length2tails_reward/mean": 0.30981114506721497, + "rewards/length2tails_reward/std": 0.4345083236694336, + "rewards/thermo_reward/mean": -0.7057662010192871, + "rewards/thermo_reward/std": 1.439083456993103, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 277.875, + "completions/mean_terminated_length": 277.875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.09779051598161459, + "epoch": 0.336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.927237868309021, + "learning_rate": 1.6896195437356697e-06, + "loss": 0.0958, + "num_tokens": 1471303.0, + "reward": 4.192678451538086, + "reward_std": 2.79734206199646, + "rewards/fitness_reward/mean": 4.619560241699219, + "rewards/fitness_reward/std": 2.6883301734924316, + "rewards/kidney_reward/mean": -0.31013429164886475, + "rewards/kidney_reward/std": 1.0193806886672974, + "rewards/length2tails_reward/mean": 0.5570563077926636, + "rewards/length2tails_reward/std": 0.45604071021080017, + "rewards/thermo_reward/mean": -0.8221579194068909, + "rewards/thermo_reward/std": 1.5529719591140747, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 266.8125, + "completions/mean_terminated_length": 266.8125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.0863744979724288, + "epoch": 0.338, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.816534399986267, + "learning_rate": 1.6845471059286886e-06, + "loss": -0.0229, + "num_tokens": 1479873.0, + "reward": 3.920487880706787, + "reward_std": 2.4281582832336426, + "rewards/fitness_reward/mean": 4.564079284667969, + "rewards/fitness_reward/std": 2.8623640537261963, + "rewards/kidney_reward/mean": -0.3698844611644745, + "rewards/kidney_reward/std": 0.9845880270004272, + "rewards/length2tails_reward/mean": 0.5217670202255249, + "rewards/length2tails_reward/std": 0.46941322088241577, + "rewards/thermo_reward/mean": -1.1781814098358154, + "rewards/thermo_reward/std": 1.6795637607574463, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 272.78125, + "completions/mean_terminated_length": 272.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.09404147509485483, + "epoch": 0.34, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7808253169059753, + "learning_rate": 1.6794413042615166e-06, + "loss": -0.0213, + "num_tokens": 1488634.0, + "reward": 5.1834716796875, + "reward_std": 1.9101296663284302, + "rewards/fitness_reward/mean": 5.6643967628479, + "rewards/fitness_reward/std": 1.505463719367981, + "rewards/kidney_reward/mean": -0.19529178738594055, + "rewards/kidney_reward/std": 1.164594054222107, + "rewards/length2tails_reward/mean": 0.4930678606033325, + "rewards/length2tails_reward/std": 0.46066921949386597, + "rewards/thermo_reward/mean": -1.0130927562713623, + "rewards/thermo_reward/std": 1.7528146505355835, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 278.5625, + "completions/mean_terminated_length": 278.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13403520546853542, + "epoch": 0.342, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6008855104446411, + "learning_rate": 1.6743023875837233e-06, + "loss": 0.0825, + "num_tokens": 1497580.0, + "reward": 4.644342422485352, + "reward_std": 2.5929994583129883, + "rewards/fitness_reward/mean": 5.213815689086914, + "rewards/fitness_reward/std": 2.7655608654022217, + "rewards/kidney_reward/mean": -0.3567451238632202, + "rewards/kidney_reward/std": 1.0828732252120972, + "rewards/length2tails_reward/mean": 0.6193132400512695, + "rewards/length2tails_reward/std": 0.4164426624774933, + "rewards/thermo_reward/mean": -1.0918560028076172, + "rewards/thermo_reward/std": 2.0735950469970703, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 264.8125, + "completions/mean_terminated_length": 264.8125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.08579262811690569, + "epoch": 0.344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9887391328811646, + "learning_rate": 1.669130606358858e-06, + "loss": -0.0684, + "num_tokens": 1506086.0, + "reward": 4.761046409606934, + "reward_std": 2.4325408935546875, + "rewards/fitness_reward/mean": 5.2099928855896, + "rewards/fitness_reward/std": 2.369246482849121, + "rewards/kidney_reward/mean": -0.16196635365486145, + "rewards/kidney_reward/std": 1.1331666707992554, + "rewards/length2tails_reward/mean": 0.6166989803314209, + "rewards/length2tails_reward/std": 0.4662693738937378, + "rewards/thermo_reward/mean": -1.0442757606506348, + "rewards/thermo_reward/std": 1.8124314546585083, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 282.65625, + "completions/mean_terminated_length": 282.65625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12472502840682864, + "epoch": 0.346, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4417450428009033, + "learning_rate": 1.6639262126522415e-06, + "loss": 0.1348, + "num_tokens": 1515163.0, + "reward": 3.654130458831787, + "reward_std": 2.815221071243286, + "rewards/fitness_reward/mean": 4.169241905212402, + "rewards/fitness_reward/std": 3.082991361618042, + "rewards/kidney_reward/mean": -0.44877898693084717, + "rewards/kidney_reward/std": 0.953915536403656, + "rewards/length2tails_reward/mean": 0.46090492606163025, + "rewards/length2tails_reward/std": 0.4796352982521057, + "rewards/thermo_reward/mean": -0.8118960857391357, + "rewards/thermo_reward/std": 1.7251765727996826, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 268.46875, + "completions/mean_terminated_length": 268.46875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.09075398044660687, + "epoch": 0.348, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7016935348510742, + "learning_rate": 1.6586894601186803e-06, + "loss": -0.0006, + "num_tokens": 1523786.0, + "reward": 4.55905818939209, + "reward_std": 2.30279541015625, + "rewards/fitness_reward/mean": 5.1573991775512695, + "rewards/fitness_reward/std": 2.173652410507202, + "rewards/kidney_reward/mean": -0.48578259348869324, + "rewards/kidney_reward/std": 1.0091580152511597, + "rewards/length2tails_reward/mean": 0.5137011408805847, + "rewards/length2tails_reward/std": 0.456093966960907, + "rewards/thermo_reward/mean": -0.967749834060669, + "rewards/thermo_reward/std": 1.7618026733398438, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 275.3125, + "completions/mean_terminated_length": 275.3125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.10816240543499589, + "epoch": 0.35, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0683670043945312, + "learning_rate": 1.6534206039901055e-06, + "loss": 0.0612, + "num_tokens": 1532628.0, + "reward": 4.2663373947143555, + "reward_std": 3.062441825866699, + "rewards/fitness_reward/mean": 4.755006790161133, + "rewards/fitness_reward/std": 2.9233791828155518, + "rewards/kidney_reward/mean": -0.5185624361038208, + "rewards/kidney_reward/std": 0.9748953580856323, + "rewards/length2tails_reward/mean": 0.5505917072296143, + "rewards/length2tails_reward/std": 0.47932368516921997, + "rewards/thermo_reward/mean": -0.7340719699859619, + "rewards/thermo_reward/std": 1.7369695901870728, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 262.96875, + "completions/mean_terminated_length": 262.96875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.07547976076602936, + "epoch": 0.352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7118106484413147, + "learning_rate": 1.6481199010631309e-06, + "loss": -0.0746, + "num_tokens": 1541075.0, + "reward": 4.428964614868164, + "reward_std": 2.574803113937378, + "rewards/fitness_reward/mean": 4.689699172973633, + "rewards/fitness_reward/std": 2.8076272010803223, + "rewards/kidney_reward/mean": -0.3593696355819702, + "rewards/kidney_reward/std": 0.9871580600738525, + "rewards/length2tails_reward/mean": 0.5770972967147827, + "rewards/length2tails_reward/std": 0.4420401155948639, + "rewards/thermo_reward/mean": -0.4506470859050751, + "rewards/thermo_reward/std": 1.959010362625122, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 273.21875, + "completions/mean_terminated_length": 273.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11087105982005596, + "epoch": 0.354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9738964438438416, + "learning_rate": 1.6427876096865393e-06, + "loss": 0.0369, + "num_tokens": 1549850.0, + "reward": 4.867439270019531, + "reward_std": 3.0394482612609863, + "rewards/fitness_reward/mean": 5.090075492858887, + "rewards/fitness_reward/std": 2.84049391746521, + "rewards/kidney_reward/mean": -0.08992902934551239, + "rewards/kidney_reward/std": 1.4132450819015503, + "rewards/length2tails_reward/mean": 0.609131395816803, + "rewards/length2tails_reward/std": 0.42378756403923035, + "rewards/thermo_reward/mean": -0.6599090099334717, + "rewards/thermo_reward/std": 1.848639965057373, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 266.78125, + "completions/mean_terminated_length": 266.78125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.1331650954671204, + "epoch": 0.356, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5233911275863647, + "learning_rate": 1.6374239897486897e-06, + "loss": -0.0134, + "num_tokens": 1558419.0, + "reward": 4.055957317352295, + "reward_std": 3.2039151191711426, + "rewards/fitness_reward/mean": 4.328873634338379, + "rewards/fitness_reward/std": 3.307469606399536, + "rewards/kidney_reward/mean": -0.20703302323818207, + "rewards/kidney_reward/std": 1.0310391187667847, + "rewards/length2tails_reward/mean": 0.4950253665447235, + "rewards/length2tails_reward/std": 0.4483602046966553, + "rewards/thermo_reward/mean": -0.5863116979598999, + "rewards/thermo_reward/std": 1.6916553974151611, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1012274301610887, + "epoch": 0.358, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4355965554714203, + "learning_rate": 1.6320293026648508e-06, + "loss": -0.0079, + "num_tokens": 1567093.0, + "reward": 5.45328426361084, + "reward_std": 1.7587991952896118, + "rewards/fitness_reward/mean": 5.767385959625244, + "rewards/fitness_reward/std": 1.449892282485962, + "rewards/kidney_reward/mean": -0.20507873594760895, + "rewards/kidney_reward/std": 1.233489990234375, + "rewards/length2tails_reward/mean": 0.6022515892982483, + "rewards/length2tails_reward/std": 0.4060637056827545, + "rewards/thermo_reward/mean": -0.7242498397827148, + "rewards/thermo_reward/std": 1.6031584739685059, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.08426529681310058, + "epoch": 0.36, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.54383385181427, + "learning_rate": 1.6266038113644605e-06, + "loss": -0.0065, + "num_tokens": 1575733.0, + "reward": 5.659174919128418, + "reward_std": 1.9989067316055298, + "rewards/fitness_reward/mean": 5.87037467956543, + "rewards/fitness_reward/std": 1.3842169046401978, + "rewards/kidney_reward/mean": -0.31498047709465027, + "rewards/kidney_reward/std": 1.2313039302825928, + "rewards/length2tails_reward/mean": 0.599600613117218, + "rewards/length2tails_reward/std": 0.42347168922424316, + "rewards/thermo_reward/mean": -0.40721940994262695, + "rewards/thermo_reward/std": 1.898787021636963, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 275.84375, + "completions/mean_terminated_length": 275.84375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.10651430813595653, + "epoch": 0.362, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1690618991851807, + "learning_rate": 1.6211477802783102e-06, + "loss": 0.085, + "num_tokens": 1584592.0, + "reward": 4.864093780517578, + "reward_std": 2.611569881439209, + "rewards/fitness_reward/mean": 5.151647567749023, + "rewards/fitness_reward/std": 2.1961724758148193, + "rewards/kidney_reward/mean": -0.056402117013931274, + "rewards/kidney_reward/std": 1.097105622291565, + "rewards/length2tails_reward/mean": 0.5109585523605347, + "rewards/length2tails_reward/std": 0.44378817081451416, + "rewards/thermo_reward/mean": -0.7741846442222595, + "rewards/thermo_reward/std": 1.9916399717330933, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 261.0, + "completions/mean_terminated_length": 261.0, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.09026870923116803, + "epoch": 0.364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0212987661361694, + "learning_rate": 1.615661475325658e-06, + "loss": -0.0984, + "num_tokens": 1592976.0, + "reward": 4.4314727783203125, + "reward_std": 3.0072882175445557, + "rewards/fitness_reward/mean": 5.038105487823486, + "rewards/fitness_reward/std": 3.0175745487213135, + "rewards/kidney_reward/mean": -0.34879419207572937, + "rewards/kidney_reward/std": 1.0839778184890747, + "rewards/length2tails_reward/mean": 0.6546170711517334, + "rewards/length2tails_reward/std": 0.42878323793411255, + "rewards/thermo_reward/mean": -1.191779613494873, + "rewards/thermo_reward/std": 1.9129706621170044, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 279.40625, + "completions/mean_terminated_length": 279.40625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1478779586032033, + "epoch": 0.366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.895674228668213, + "learning_rate": 1.6101451639012677e-06, + "loss": 0.0838, + "num_tokens": 1601949.0, + "reward": 4.561385154724121, + "reward_std": 3.6965315341949463, + "rewards/fitness_reward/mean": 4.620615005493164, + "rewards/fitness_reward/std": 3.3410534858703613, + "rewards/kidney_reward/mean": -0.08034095913171768, + "rewards/kidney_reward/std": 1.157870888710022, + "rewards/length2tails_reward/mean": 0.5722231864929199, + "rewards/length2tails_reward/std": 0.4358516335487366, + "rewards/thermo_reward/mean": -0.3242303729057312, + "rewards/thermo_reward/std": 1.781525731086731, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.1301782624796033, + "epoch": 0.368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1593161821365356, + "learning_rate": 1.604599114862375e-06, + "loss": 0.0082, + "num_tokens": 1610623.0, + "reward": 4.000077247619629, + "reward_std": 3.2169172763824463, + "rewards/fitness_reward/mean": 4.352954864501953, + "rewards/fitness_reward/std": 3.510301113128662, + "rewards/kidney_reward/mean": -0.22353899478912354, + "rewards/kidney_reward/std": 1.2535496950149536, + "rewards/length2tails_reward/mean": 0.5596377849578857, + "rewards/length2tails_reward/std": 0.44628483057022095, + "rewards/thermo_reward/mean": -0.7620350122451782, + "rewards/thermo_reward/std": 1.802309274673462, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.0893715862184763, + "epoch": 0.37, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2550925016403198, + "learning_rate": 1.5990235985155857e-06, + "loss": 0.0254, + "num_tokens": 1619269.0, + "reward": 4.491425037384033, + "reward_std": 2.9865565299987793, + "rewards/fitness_reward/mean": 4.715381145477295, + "rewards/fitness_reward/std": 3.102266788482666, + "rewards/kidney_reward/mean": -0.037865400314331055, + "rewards/kidney_reward/std": 1.1800258159637451, + "rewards/length2tails_reward/mean": 0.448363333940506, + "rewards/length2tails_reward/std": 0.4457945227622986, + "rewards/thermo_reward/mean": -0.6342282891273499, + "rewards/thermo_reward/std": 1.8643159866333008, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 275.625, + "completions/mean_terminated_length": 275.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12593226647004485, + "epoch": 0.372, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3675857782363892, + "learning_rate": 1.5934188866037015e-06, + "loss": 0.0445, + "num_tokens": 1628121.0, + "reward": 3.7773280143737793, + "reward_std": 3.876007318496704, + "rewards/fitness_reward/mean": 4.293154716491699, + "rewards/fitness_reward/std": 3.9880714416503906, + "rewards/kidney_reward/mean": -0.42351239919662476, + "rewards/kidney_reward/std": 1.1867891550064087, + "rewards/length2tails_reward/mean": 0.6898555159568787, + "rewards/length2tails_reward/std": 0.38954636454582214, + "rewards/thermo_reward/mean": -0.9530682563781738, + "rewards/thermo_reward/std": 2.0106310844421387, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 263.15625, + "completions/mean_terminated_length": 263.15625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.14193418761715293, + "epoch": 0.374, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.44107985496521, + "learning_rate": 1.587785252292473e-06, + "loss": -0.0755, + "num_tokens": 1636574.0, + "reward": 5.446175575256348, + "reward_std": 2.6817963123321533, + "rewards/fitness_reward/mean": 5.443154811859131, + "rewards/fitness_reward/std": 2.656590461730957, + "rewards/kidney_reward/mean": -0.061875101178884506, + "rewards/kidney_reward/std": 1.2702064514160156, + "rewards/length2tails_reward/mean": 0.6279922723770142, + "rewards/length2tails_reward/std": 0.4024559259414673, + "rewards/thermo_reward/mean": -0.2460794299840927, + "rewards/thermo_reward/std": 2.1052651405334473, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 270.6875, + "completions/mean_terminated_length": 270.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1061564115807414, + "epoch": 0.376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2134290933609009, + "learning_rate": 1.5821229701572893e-06, + "loss": 0.0174, + "num_tokens": 1645268.0, + "reward": 4.806356430053711, + "reward_std": 2.3874363899230957, + "rewards/fitness_reward/mean": 5.02333927154541, + "rewards/fitness_reward/std": 2.3042352199554443, + "rewards/kidney_reward/mean": -0.09063389897346497, + "rewards/kidney_reward/std": 1.093129277229309, + "rewards/length2tails_reward/mean": 0.48920953273773193, + "rewards/length2tails_reward/std": 0.44646406173706055, + "rewards/thermo_reward/mean": -0.5879369974136353, + "rewards/thermo_reward/std": 1.769646167755127, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11166581884026527, + "epoch": 0.378, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3294186294078827, + "learning_rate": 1.5764323161697932e-06, + "loss": -0.0034, + "num_tokens": 1653945.0, + "reward": 6.120540618896484, + "reward_std": 1.7913304567337036, + "rewards/fitness_reward/mean": 6.076353073120117, + "rewards/fitness_reward/std": 1.2157715559005737, + "rewards/kidney_reward/mean": 0.07283775508403778, + "rewards/kidney_reward/std": 1.3584492206573486, + "rewards/length2tails_reward/mean": 0.6484661102294922, + "rewards/length2tails_reward/std": 0.38605931401252747, + "rewards/thermo_reward/mean": -0.30869537591934204, + "rewards/thermo_reward/std": 1.9456088542938232, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11868252046406269, + "epoch": 0.38, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6086766719818115, + "learning_rate": 1.5707135676844319e-06, + "loss": 0.0242, + "num_tokens": 1662654.0, + "reward": 5.367982387542725, + "reward_std": 2.7623069286346436, + "rewards/fitness_reward/mean": 5.724161148071289, + "rewards/fitness_reward/std": 2.252316951751709, + "rewards/kidney_reward/mean": -0.4771338403224945, + "rewards/kidney_reward/std": 1.0981281995773315, + "rewards/length2tails_reward/mean": 0.6890479326248169, + "rewards/length2tails_reward/std": 0.36876070499420166, + "rewards/thermo_reward/mean": -0.5797474384307861, + "rewards/thermo_reward/std": 1.96684730052948, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.10897703189402819, + "epoch": 0.382, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7039817571640015, + "learning_rate": 1.564967003424938e-06, + "loss": -0.0034, + "num_tokens": 1671306.0, + "reward": 4.808743476867676, + "reward_std": 3.007719039916992, + "rewards/fitness_reward/mean": 4.748951435089111, + "rewards/fitness_reward/std": 2.986243724822998, + "rewards/kidney_reward/mean": -0.17581090331077576, + "rewards/kidney_reward/std": 1.232522964477539, + "rewards/length2tails_reward/mean": 0.5685861110687256, + "rewards/length2tails_reward/std": 0.4413212239742279, + "rewards/thermo_reward/mean": 0.011101648211479187, + "rewards/thermo_reward/std": 1.8753315210342407, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 273.84375, + "completions/mean_terminated_length": 273.84375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11015730071812868, + "epoch": 0.384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9565957188606262, + "learning_rate": 1.5591929034707466e-06, + "loss": 0.027, + "num_tokens": 1680101.0, + "reward": 5.647148132324219, + "reward_std": 2.985407829284668, + "rewards/fitness_reward/mean": 5.495434761047363, + "rewards/fitness_reward/std": 2.468876838684082, + "rewards/kidney_reward/mean": 0.06246180832386017, + "rewards/kidney_reward/std": 1.225472331047058, + "rewards/length2tails_reward/mean": 0.7170220613479614, + "rewards/length2tails_reward/std": 0.3801810145378113, + "rewards/thermo_reward/mean": -0.11754542589187622, + "rewards/thermo_reward/std": 1.9760782718658447, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.12005980499088764, + "epoch": 0.386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6544195413589478, + "learning_rate": 1.553391549243344e-06, + "loss": -0.0182, + "num_tokens": 1688763.0, + "reward": 4.954888820648193, + "reward_std": 2.4991729259490967, + "rewards/fitness_reward/mean": 5.40878963470459, + "rewards/fitness_reward/std": 2.3679397106170654, + "rewards/kidney_reward/mean": -0.49122127890586853, + "rewards/kidney_reward/std": 1.202668309211731, + "rewards/length2tails_reward/mean": 0.6947495937347412, + "rewards/length2tails_reward/std": 0.41318637132644653, + "rewards/thermo_reward/mean": -0.7639557123184204, + "rewards/thermo_reward/std": 1.8023275136947632, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12109254207462072, + "epoch": 0.388, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2701301574707031, + "learning_rate": 1.5475632234925502e-06, + "loss": 0.042, + "num_tokens": 1697570.0, + "reward": 4.846124649047852, + "reward_std": 2.819533348083496, + "rewards/fitness_reward/mean": 5.228612899780273, + "rewards/fitness_reward/std": 2.733485221862793, + "rewards/kidney_reward/mean": -0.49513113498687744, + "rewards/kidney_reward/std": 1.1379722356796265, + "rewards/length2tails_reward/mean": 0.7269919514656067, + "rewards/length2tails_reward/std": 0.378961980342865, + "rewards/thermo_reward/mean": -0.6333409547805786, + "rewards/thermo_reward/std": 1.9405333995819092, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 263.8125, + "completions/mean_terminated_length": 263.8125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.10591288423165679, + "epoch": 0.39, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9085234999656677, + "learning_rate": 1.54170821028274e-06, + "loss": -0.0855, + "num_tokens": 1706044.0, + "reward": 5.504973411560059, + "reward_std": 2.731473684310913, + "rewards/fitness_reward/mean": 5.504761219024658, + "rewards/fitness_reward/std": 2.3768563270568848, + "rewards/kidney_reward/mean": -0.12942233681678772, + "rewards/kidney_reward/std": 1.1659127473831177, + "rewards/length2tails_reward/mean": 0.6184656023979187, + "rewards/length2tails_reward/std": 0.4209434986114502, + "rewards/thermo_reward/mean": -0.17938566207885742, + "rewards/thermo_reward/std": 1.864436388015747, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11573971435427666, + "epoch": 0.392, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7765392065048218, + "learning_rate": 1.5358267949789964e-06, + "loss": 0.0344, + "num_tokens": 1714808.0, + "reward": 5.148538112640381, + "reward_std": 2.2965903282165527, + "rewards/fitness_reward/mean": 5.610849857330322, + "rewards/fitness_reward/std": 2.3347136974334717, + "rewards/kidney_reward/mean": -0.3197159171104431, + "rewards/kidney_reward/std": 1.0197172164916992, + "rewards/length2tails_reward/mean": 0.6769318580627441, + "rewards/length2tails_reward/std": 0.43633151054382324, + "rewards/thermo_reward/mean": -0.9433727860450745, + "rewards/thermo_reward/std": 2.0293006896972656, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 275.0625, + "completions/mean_terminated_length": 275.0625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11778477020561695, + "epoch": 0.394, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.153994560241699, + "learning_rate": 1.5299192642332049e-06, + "loss": 0.0619, + "num_tokens": 1723642.0, + "reward": 5.358760356903076, + "reward_std": 2.5633435249328613, + "rewards/fitness_reward/mean": 5.500892639160156, + "rewards/fitness_reward/std": 2.3947198390960693, + "rewards/kidney_reward/mean": -0.1386294811964035, + "rewards/kidney_reward/std": 1.0449038743972778, + "rewards/length2tails_reward/mean": 0.6533316373825073, + "rewards/length2tails_reward/std": 0.40731319785118103, + "rewards/thermo_reward/mean": -0.47230052947998047, + "rewards/thermo_reward/std": 2.0186452865600586, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12082445435225964, + "epoch": 0.396, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.474387526512146, + "learning_rate": 1.5239859059700793e-06, + "loss": -0.0076, + "num_tokens": 1732328.0, + "reward": 5.87947940826416, + "reward_std": 1.8399745225906372, + "rewards/fitness_reward/mean": 5.973363876342773, + "rewards/fitness_reward/std": 1.3069151639938354, + "rewards/kidney_reward/mean": -0.43369150161743164, + "rewards/kidney_reward/std": 1.1159157752990723, + "rewards/length2tails_reward/mean": 0.746300220489502, + "rewards/length2tails_reward/std": 0.3608926236629486, + "rewards/thermo_reward/mean": -0.1272280216217041, + "rewards/thermo_reward/std": 1.7771265506744385, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.11528191063553095, + "epoch": 0.398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6634423136711121, + "learning_rate": 1.5180270093731302e-06, + "loss": -0.0124, + "num_tokens": 1740991.0, + "reward": 5.016768932342529, + "reward_std": 2.8234593868255615, + "rewards/fitness_reward/mean": 5.254176616668701, + "rewards/fitness_reward/std": 2.630495309829712, + "rewards/kidney_reward/mean": -0.429448664188385, + "rewards/kidney_reward/std": 1.11064612865448, + "rewards/length2tails_reward/mean": 0.6609193086624146, + "rewards/length2tails_reward/std": 0.4230346083641052, + "rewards/thermo_reward/mean": -0.375826895236969, + "rewards/thermo_reward/std": 1.7914979457855225, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 278.34375, + "completions/mean_terminated_length": 278.34375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.14113888517022133, + "epoch": 0.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484531164169312, + "learning_rate": 1.5120428648705715e-06, + "loss": 0.0702, + "num_tokens": 1749930.0, + "reward": 4.45833683013916, + "reward_std": 3.1881518363952637, + "rewards/fitness_reward/mean": 5.084860324859619, + "rewards/fitness_reward/std": 3.2290408611297607, + "rewards/kidney_reward/mean": -0.6342756748199463, + "rewards/kidney_reward/std": 1.010218620300293, + "rewards/length2tails_reward/mean": 0.7552772164344788, + "rewards/length2tails_reward/std": 0.3683543801307678, + "rewards/thermo_reward/mean": -0.9964100122451782, + "rewards/thermo_reward/std": 2.1216630935668945, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 266.65625, + "completions/mean_terminated_length": 266.65625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.12928149849176407, + "epoch": 0.402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6312239170074463, + "learning_rate": 1.5060337641211636e-06, + "loss": -0.0221, + "num_tokens": 1758495.0, + "reward": 5.229574203491211, + "reward_std": 3.2260067462921143, + "rewards/fitness_reward/mean": 5.103326797485352, + "rewards/fitness_reward/std": 3.447283983230591, + "rewards/kidney_reward/mean": -0.06352463364601135, + "rewards/kidney_reward/std": 1.290382742881775, + "rewards/length2tails_reward/mean": 0.6177878379821777, + "rewards/length2tails_reward/std": 0.41295352578163147, + "rewards/thermo_reward/mean": 0.007125034928321838, + "rewards/thermo_reward/std": 2.120759963989258, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 280.5625, + "completions/mean_terminated_length": 280.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14953106082975864, + "epoch": 0.404, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.925811529159546, + "learning_rate": 1.5e-06, + "loss": 0.1222, + "num_tokens": 1767505.0, + "reward": 5.001443386077881, + "reward_std": 3.1893393993377686, + "rewards/fitness_reward/mean": 5.0646443367004395, + "rewards/fitness_reward/std": 2.9247078895568848, + "rewards/kidney_reward/mean": -0.13537967205047607, + "rewards/kidney_reward/std": 1.2401924133300781, + "rewards/length2tails_reward/mean": 0.6247571706771851, + "rewards/length2tails_reward/std": 0.42491886019706726, + "rewards/thermo_reward/mean": -0.30340126156806946, + "rewards/thermo_reward/std": 1.8612476587295532, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 265.9375, + "completions/mean_terminated_length": 265.9375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.12770347204059362, + "epoch": 0.406, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9480125904083252, + "learning_rate": 1.4939418665842309e-06, + "loss": -0.012, + "num_tokens": 1776047.0, + "reward": 5.106375694274902, + "reward_std": 3.3707869052886963, + "rewards/fitness_reward/mean": 5.213399887084961, + "rewards/fitness_reward/std": 3.1237714290618896, + "rewards/kidney_reward/mean": 0.16533440351486206, + "rewards/kidney_reward/std": 1.2188705205917358, + "rewards/length2tails_reward/mean": 0.6536604166030884, + "rewards/length2tails_reward/std": 0.41311925649642944, + "rewards/thermo_reward/mean": -0.7062134742736816, + "rewards/thermo_reward/std": 1.8557100296020508, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 268.84375, + "completions/mean_terminated_length": 268.84375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.11901164799928665, + "epoch": 0.408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2556489706039429, + "learning_rate": 1.4878596591387326e-06, + "loss": -0.0105, + "num_tokens": 1784682.0, + "reward": 5.204383850097656, + "reward_std": 3.6209285259246826, + "rewards/fitness_reward/mean": 5.13774299621582, + "rewards/fitness_reward/std": 3.0786664485931396, + "rewards/kidney_reward/mean": 0.12757733464241028, + "rewards/kidney_reward/std": 1.3261386156082153, + "rewards/length2tails_reward/mean": 0.7046136856079102, + "rewards/length2tails_reward/std": 0.3748266100883484, + "rewards/thermo_reward/mean": -0.34660279750823975, + "rewards/thermo_reward/std": 1.6802020072937012, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 283.0, + "completions/mean_terminated_length": 267.80645751953125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.1570515912026167, + "epoch": 0.41, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4501383304595947, + "learning_rate": 1.4817536741017151e-06, + "loss": 0.1255, + "num_tokens": 1793770.0, + "reward": 5.006608009338379, + "reward_std": 3.3003995418548584, + "rewards/fitness_reward/mean": 5.15645170211792, + "rewards/fitness_reward/std": 3.2961273193359375, + "rewards/kidney_reward/mean": -0.27005457878112793, + "rewards/kidney_reward/std": 1.1989458799362183, + "rewards/length2tails_reward/mean": 0.7012870907783508, + "rewards/length2tails_reward/std": 0.39138534665107727, + "rewards/thermo_reward/mean": -0.38027605414390564, + "rewards/thermo_reward/std": 1.9341998100280762, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 280.4375, + "completions/mean_terminated_length": 280.4375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.17629681713879108, + "epoch": 0.412, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0244832038879395, + "learning_rate": 1.4756242090702753e-06, + "loss": -0.0026, + "num_tokens": 1802776.0, + "reward": 5.560153484344482, + "reward_std": 2.769331693649292, + "rewards/fitness_reward/mean": 5.607787132263184, + "rewards/fitness_reward/std": 2.7524917125701904, + "rewards/kidney_reward/mean": -0.1659838706254959, + "rewards/kidney_reward/std": 1.1878471374511719, + "rewards/length2tails_reward/mean": 0.7757689952850342, + "rewards/length2tails_reward/std": 0.3364262878894806, + "rewards/thermo_reward/mean": -0.3171682059764862, + "rewards/thermo_reward/std": 2.1596550941467285, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.18510614708065987, + "epoch": 0.414, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8747198581695557, + "learning_rate": 1.4694715627858908e-06, + "loss": -0.0064, + "num_tokens": 1811491.0, + "reward": 4.97410774230957, + "reward_std": 4.071346759796143, + "rewards/fitness_reward/mean": 4.744723320007324, + "rewards/fitness_reward/std": 3.860710620880127, + "rewards/kidney_reward/mean": -0.04718928039073944, + "rewards/kidney_reward/std": 1.0953369140625, + "rewards/length2tails_reward/mean": 0.7307334542274475, + "rewards/length2tails_reward/std": 0.36576011776924133, + "rewards/thermo_reward/mean": 0.14059212803840637, + "rewards/thermo_reward/std": 1.8796796798706055, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 0.17540221381932497, + "epoch": 0.416, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8250596523284912, + "learning_rate": 1.4632960351198617e-06, + "loss": -0.0062, + "num_tokens": 1820202.0, + "reward": 4.9364848136901855, + "reward_std": 3.8868162631988525, + "rewards/fitness_reward/mean": 4.679059028625488, + "rewards/fitness_reward/std": 3.7941133975982666, + "rewards/kidney_reward/mean": 0.12615957856178284, + "rewards/kidney_reward/std": 1.3088197708129883, + "rewards/length2tails_reward/mean": 0.7231643199920654, + "rewards/length2tails_reward/std": 0.3762193024158478, + "rewards/thermo_reward/mean": 0.027110159397125244, + "rewards/thermo_reward/std": 2.1192996501922607, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 274.5625, + "completions/mean_terminated_length": 274.5625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.14119845256209373, + "epoch": 0.418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1828947067260742, + "learning_rate": 1.4570979270586943e-06, + "loss": -0.0941, + "num_tokens": 1829020.0, + "reward": 4.36683988571167, + "reward_std": 3.606301784515381, + "rewards/fitness_reward/mean": 4.932483673095703, + "rewards/fitness_reward/std": 3.6595046520233154, + "rewards/kidney_reward/mean": -0.5763438940048218, + "rewards/kidney_reward/std": 0.9190284013748169, + "rewards/length2tails_reward/mean": 0.7905447483062744, + "rewards/length2tails_reward/std": 0.33445608615875244, + "rewards/thermo_reward/mean": -0.9502164125442505, + "rewards/thermo_reward/std": 2.0955722332000732, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 279.1875, + "completions/mean_terminated_length": 279.1875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.18222328554838896, + "epoch": 0.42, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.100304126739502, + "learning_rate": 1.4508775406894306e-06, + "loss": 0.0794, + "num_tokens": 1837986.0, + "reward": 5.177816867828369, + "reward_std": 3.4072232246398926, + "rewards/fitness_reward/mean": 5.094880104064941, + "rewards/fitness_reward/std": 3.1885640621185303, + "rewards/kidney_reward/mean": -0.011289328336715698, + "rewards/kidney_reward/std": 1.2078779935836792, + "rewards/length2tails_reward/mean": 0.6524442434310913, + "rewards/length2tails_reward/std": 0.4188677966594696, + "rewards/thermo_reward/mean": -0.14905908703804016, + "rewards/thermo_reward/std": 1.923143982887268, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.13147749193012714, + "epoch": 0.422, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3564049005508423, + "learning_rate": 1.4446351791849273e-06, + "loss": 0.0129, + "num_tokens": 1846750.0, + "reward": 5.809633255004883, + "reward_std": 3.2329182624816895, + "rewards/fitness_reward/mean": 5.548314094543457, + "rewards/fitness_reward/std": 2.9709882736206055, + "rewards/kidney_reward/mean": 0.1898169368505478, + "rewards/kidney_reward/std": 1.201176643371582, + "rewards/length2tails_reward/mean": 0.7714184522628784, + "rewards/length2tails_reward/std": 0.33874425292015076, + "rewards/thermo_reward/mean": -0.05288762226700783, + "rewards/thermo_reward/std": 1.9934961795806885, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 280.5625, + "completions/mean_terminated_length": 280.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.148380596190691, + "epoch": 0.424, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1136276721954346, + "learning_rate": 1.4383711467890773e-06, + "loss": 0.1043, + "num_tokens": 1855760.0, + "reward": 5.165239334106445, + "reward_std": 3.7728660106658936, + "rewards/fitness_reward/mean": 5.120419502258301, + "rewards/fitness_reward/std": 3.3934967517852783, + "rewards/kidney_reward/mean": -0.13617633283138275, + "rewards/kidney_reward/std": 1.219999074935913, + "rewards/length2tails_reward/mean": 0.6721340417861938, + "rewards/length2tails_reward/std": 0.37673434615135193, + "rewards/thermo_reward/mean": -0.11025162041187286, + "rewards/thermo_reward/std": 2.06036639213562, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 279.40625, + "completions/mean_terminated_length": 279.40625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.14004177413880825, + "epoch": 0.426, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.625136375427246, + "learning_rate": 1.4320857488019824e-06, + "loss": 0.094, + "num_tokens": 1864733.0, + "reward": 5.282578468322754, + "reward_std": 2.975006103515625, + "rewards/fitness_reward/mean": 5.515386581420898, + "rewards/fitness_reward/std": 3.1072726249694824, + "rewards/kidney_reward/mean": 0.1614631861448288, + "rewards/kidney_reward/std": 1.4302942752838135, + "rewards/length2tails_reward/mean": 0.7769113779067993, + "rewards/length2tails_reward/std": 0.3295917212963104, + "rewards/thermo_reward/mean": -1.015535831451416, + "rewards/thermo_reward/std": 2.2133193016052246, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.1491750106215477, + "epoch": 0.428, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8770372867584229, + "learning_rate": 1.4257792915650725e-06, + "loss": -0.0188, + "num_tokens": 1873420.0, + "reward": 5.447704315185547, + "reward_std": 3.252995491027832, + "rewards/fitness_reward/mean": 5.594566345214844, + "rewards/fitness_reward/std": 2.7997264862060547, + "rewards/kidney_reward/mean": -0.48993390798568726, + "rewards/kidney_reward/std": 1.1592376232147217, + "rewards/length2tails_reward/mean": 0.8639081716537476, + "rewards/length2tails_reward/std": 0.25261324644088745, + "rewards/thermo_reward/mean": -0.23574352264404297, + "rewards/thermo_reward/std": 2.2791671752929688, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 266.28125, + "completions/mean_terminated_length": 266.28125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.16167325340211391, + "epoch": 0.43, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9747185707092285, + "learning_rate": 1.419452082446177e-06, + "loss": -0.0616, + "num_tokens": 1881973.0, + "reward": 5.3051886558532715, + "reward_std": 2.940993070602417, + "rewards/fitness_reward/mean": 5.305299282073975, + "rewards/fitness_reward/std": 2.8063652515411377, + "rewards/kidney_reward/mean": -0.20494236052036285, + "rewards/kidney_reward/std": 1.165364384651184, + "rewards/length2tails_reward/mean": 0.7368552684783936, + "rewards/length2tails_reward/std": 0.37866973876953125, + "rewards/thermo_reward/mean": -0.1637059599161148, + "rewards/thermo_reward/std": 1.9145219326019287, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12187883397564292, + "epoch": 0.432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6405536532402039, + "learning_rate": 1.4131044298245418e-06, + "loss": -0.0104, + "num_tokens": 1890686.0, + "reward": 5.89649772644043, + "reward_std": 2.0160536766052246, + "rewards/fitness_reward/mean": 5.973363876342773, + "rewards/fitness_reward/std": 1.3069151639938354, + "rewards/kidney_reward/mean": -0.33626803755760193, + "rewards/kidney_reward/std": 1.403552532196045, + "rewards/length2tails_reward/mean": 0.6964526176452637, + "rewards/length2tails_reward/std": 0.41551831364631653, + "rewards/thermo_reward/mean": -0.1656903624534607, + "rewards/thermo_reward/std": 1.8359384536743164, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 267.8125, + "completions/mean_terminated_length": 267.8125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.13484436459839344, + "epoch": 0.434, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.467594861984253, + "learning_rate": 1.4067366430758004e-06, + "loss": -0.045, + "num_tokens": 1899288.0, + "reward": 4.882638454437256, + "reward_std": 2.991011619567871, + "rewards/fitness_reward/mean": 5.251405715942383, + "rewards/fitness_reward/std": 2.9786839485168457, + "rewards/kidney_reward/mean": -0.5040518045425415, + "rewards/kidney_reward/std": 1.2910178899765015, + "rewards/length2tails_reward/mean": 0.7873901128768921, + "rewards/length2tails_reward/std": 0.35044264793395996, + "rewards/thermo_reward/mean": -0.6271776556968689, + "rewards/thermo_reward/std": 2.0984861850738525, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12841098196804523, + "epoch": 0.436, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4370866119861603, + "learning_rate": 1.400349032556895e-06, + "loss": 0.0072, + "num_tokens": 1907962.0, + "reward": 5.410283088684082, + "reward_std": 3.0423872470855713, + "rewards/fitness_reward/mean": 5.468869209289551, + "rewards/fitness_reward/std": 2.5450143814086914, + "rewards/kidney_reward/mean": -0.1920437067747116, + "rewards/kidney_reward/std": 1.4910067319869995, + "rewards/length2tails_reward/mean": 0.681010365486145, + "rewards/length2tails_reward/std": 0.38825103640556335, + "rewards/thermo_reward/mean": -0.26563379168510437, + "rewards/thermo_reward/std": 2.002378225326538, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 272.5625, + "completions/mean_terminated_length": 272.5625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.16766507271677256, + "epoch": 0.438, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.009949207305908, + "learning_rate": 1.393941909590951e-06, + "loss": 0.0082, + "num_tokens": 1916716.0, + "reward": 4.8232502937316895, + "reward_std": 3.339430809020996, + "rewards/fitness_reward/mean": 5.122108459472656, + "rewards/fitness_reward/std": 3.3906185626983643, + "rewards/kidney_reward/mean": -0.31029796600341797, + "rewards/kidney_reward/std": 1.0983688831329346, + "rewards/length2tails_reward/mean": 0.7626557350158691, + "rewards/length2tails_reward/std": 0.3781687319278717, + "rewards/thermo_reward/mean": -0.6687458753585815, + "rewards/thermo_reward/std": 2.1708645820617676, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.59375, + "completions/mean_terminated_length": 267.59375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.10749762738123536, + "epoch": 0.44, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5059139728546143, + "learning_rate": 1.3875155864521028e-06, + "loss": -0.0014, + "num_tokens": 1925311.0, + "reward": 6.180752277374268, + "reward_std": 1.7418967485427856, + "rewards/fitness_reward/mean": 6.076353073120117, + "rewards/fitness_reward/std": 1.2157716751098633, + "rewards/kidney_reward/mean": 0.008736655116081238, + "rewards/kidney_reward/std": 1.3900187015533447, + "rewards/length2tails_reward/mean": 0.5785348415374756, + "rewards/length2tails_reward/std": 0.3666441738605499, + "rewards/thermo_reward/mean": -0.0892057865858078, + "rewards/thermo_reward/std": 1.8389132022857666, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 271.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12798354495316744, + "epoch": 0.442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3278595209121704, + "learning_rate": 1.3810703763502743e-06, + "loss": -0.0031, + "num_tokens": 1934037.0, + "reward": 6.395440101623535, + "reward_std": 1.592313289642334, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.09271426498889923, + "rewards/kidney_reward/std": 1.35648775100708, + "rewards/length2tails_reward/mean": 0.8260397911071777, + "rewards/length2tails_reward/std": 0.28667116165161133, + "rewards/thermo_reward/mean": -0.30006542801856995, + "rewards/thermo_reward/std": 2.1212196350097656, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 276.78125, + "completions/mean_terminated_length": 276.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1318447468802333, + "epoch": 0.444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2908531427383423, + "learning_rate": 1.374606593415912e-06, + "loss": 0.0577, + "num_tokens": 1942926.0, + "reward": 5.608564853668213, + "reward_std": 2.8474864959716797, + "rewards/fitness_reward/mean": 5.65217399597168, + "rewards/fitness_reward/std": 2.6082570552825928, + "rewards/kidney_reward/mean": -0.11263138800859451, + "rewards/kidney_reward/std": 1.3512107133865356, + "rewards/length2tails_reward/mean": 0.7171779870986938, + "rewards/length2tails_reward/std": 0.3670293688774109, + "rewards/thermo_reward/mean": -0.3331752121448517, + "rewards/thermo_reward/std": 2.078946828842163, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11585133709013462, + "epoch": 0.446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5567060708999634, + "learning_rate": 1.3681245526846781e-06, + "loss": -0.0016, + "num_tokens": 1951603.0, + "reward": 5.846627235412598, + "reward_std": 1.9089988470077515, + "rewards/fitness_reward/mean": 6.076353073120117, + "rewards/fitness_reward/std": 1.2157716751098633, + "rewards/kidney_reward/mean": -0.44368448853492737, + "rewards/kidney_reward/std": 1.363287329673767, + "rewards/length2tails_reward/mean": 0.7362430095672607, + "rewards/length2tails_reward/std": 0.3501988351345062, + "rewards/thermo_reward/mean": -0.3838888108730316, + "rewards/thermo_reward/std": 2.1188266277313232, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12520943023264408, + "epoch": 0.448, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4506056010723114, + "learning_rate": 1.361624570082092e-06, + "loss": -0.0028, + "num_tokens": 1960293.0, + "reward": 5.522380828857422, + "reward_std": 2.2656264305114746, + "rewards/fitness_reward/mean": 5.861753940582275, + "rewards/fitness_reward/std": 2.045424222946167, + "rewards/kidney_reward/mean": -0.05382596701383591, + "rewards/kidney_reward/std": 1.3001813888549805, + "rewards/length2tails_reward/mean": 0.7590430974960327, + "rewards/length2tails_reward/std": 0.3836628794670105, + "rewards/thermo_reward/mean": -1.0044409036636353, + "rewards/thermo_reward/std": 2.1063218116760254, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1355925602838397, + "epoch": 0.45, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0922815799713135, + "learning_rate": 1.3551069624081371e-06, + "loss": 0.0038, + "num_tokens": 1969001.0, + "reward": 5.617181777954102, + "reward_std": 2.6733498573303223, + "rewards/fitness_reward/mean": 5.599064826965332, + "rewards/fitness_reward/std": 2.7911274433135986, + "rewards/kidney_reward/mean": -0.12046325206756592, + "rewards/kidney_reward/std": 1.403606653213501, + "rewards/length2tails_reward/mean": 0.7542349100112915, + "rewards/length2tails_reward/std": 0.3374869227409363, + "rewards/thermo_reward/mean": -0.2204195111989975, + "rewards/thermo_reward/std": 1.9917058944702148, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.71875, + "completions/mean_terminated_length": 269.71875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.15132870338857174, + "epoch": 0.452, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.819502592086792, + "learning_rate": 1.3485720473218152e-06, + "loss": -0.0074, + "num_tokens": 1977664.0, + "reward": 5.580442428588867, + "reward_std": 2.7787983417510986, + "rewards/fitness_reward/mean": 5.590456962585449, + "rewards/fitness_reward/std": 2.43131947517395, + "rewards/kidney_reward/mean": -0.43889474868774414, + "rewards/kidney_reward/std": 1.0456558465957642, + "rewards/length2tails_reward/mean": 0.7070517539978027, + "rewards/length2tails_reward/std": 0.37388545274734497, + "rewards/thermo_reward/mean": 0.06533941626548767, + "rewards/thermo_reward/std": 2.0557384490966797, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1192570636048913, + "epoch": 0.454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5030652284622192, + "learning_rate": 1.3420201433256689e-06, + "loss": -0.0049, + "num_tokens": 1986337.0, + "reward": 5.5395050048828125, + "reward_std": 1.7637563943862915, + "rewards/fitness_reward/mean": 5.87037467956543, + "rewards/fitness_reward/std": 1.3842169046401978, + "rewards/kidney_reward/mean": -0.41122961044311523, + "rewards/kidney_reward/std": 1.297377109527588, + "rewards/length2tails_reward/mean": 0.7080308794975281, + "rewards/length2tails_reward/std": 0.37696659564971924, + "rewards/thermo_reward/mean": -0.6045256853103638, + "rewards/thermo_reward/std": 1.8804250955581665, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 279.78125, + "completions/mean_terminated_length": 279.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1760839968919754, + "epoch": 0.456, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4869167804718018, + "learning_rate": 1.3354515697502551e-06, + "loss": 0.0889, + "num_tokens": 1995322.0, + "reward": 5.507205963134766, + "reward_std": 3.225966691970825, + "rewards/fitness_reward/mean": 5.797521591186523, + "rewards/fitness_reward/std": 2.754859685897827, + "rewards/kidney_reward/mean": -0.3719463348388672, + "rewards/kidney_reward/std": 1.0769273042678833, + "rewards/length2tails_reward/mean": 0.8598864078521729, + "rewards/length2tails_reward/std": 0.22975370287895203, + "rewards/thermo_reward/mean": -0.6386279463768005, + "rewards/thermo_reward/std": 2.108335494995117, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 291.34375, + "completions/mean_terminated_length": 291.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.17017631325870752, + "epoch": 0.458, + "frac_reward_zero_std": 0.0, + "grad_norm": NaN, + "learning_rate": 1.3288666467385831e-06, + "loss": 0.16, + "num_tokens": 2004677.0, + "reward": 5.238361358642578, + "reward_std": 3.569972276687622, + "rewards/fitness_reward/mean": 5.240414142608643, + "rewards/fitness_reward/std": 3.045217752456665, + "rewards/kidney_reward/mean": -0.010104184970259666, + "rewards/kidney_reward/std": 1.4684005975723267, + "rewards/length2tails_reward/mean": 0.8191745281219482, + "rewards/length2tails_reward/std": 0.2940421402454376, + "rewards/thermo_reward/mean": -0.40358853340148926, + "rewards/thermo_reward/std": 2.2815184593200684, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13159669190645218, + "epoch": 0.46, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3491942584514618, + "learning_rate": 1.3288666467385831e-06, + "loss": 0.0035, + "num_tokens": 2013377.0, + "reward": 6.155538082122803, + "reward_std": 1.6413859128952026, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.5473537445068359, + "rewards/kidney_reward/std": 1.161616563796997, + "rewards/length2tails_reward/mean": 0.8180912733078003, + "rewards/length2tails_reward/std": 0.2914470434188843, + "rewards/thermo_reward/mean": -0.3212563991546631, + "rewards/thermo_reward/std": 2.1364221572875977, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 276.0, + "completions/mean_terminated_length": 276.0, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.18014958128333092, + "epoch": 0.462, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8534668684005737, + "learning_rate": 1.3222656952305111e-06, + "loss": 0.0644, + "num_tokens": 2022241.0, + "reward": 4.984916687011719, + "reward_std": 3.7312705516815186, + "rewards/fitness_reward/mean": 5.267140865325928, + "rewards/fitness_reward/std": 3.2565789222717285, + "rewards/kidney_reward/mean": -0.46993765234947205, + "rewards/kidney_reward/std": 1.2665092945098877, + "rewards/length2tails_reward/mean": 0.7773940563201904, + "rewards/length2tails_reward/std": 0.2976566553115845, + "rewards/thermo_reward/mean": -0.48320838809013367, + "rewards/thermo_reward/std": 1.9547299146652222, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13126275781542063, + "epoch": 0.464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4601180851459503, + "learning_rate": 1.3156490369471024e-06, + "loss": 0.0114, + "num_tokens": 2030975.0, + "reward": 6.162424087524414, + "reward_std": 1.498247742652893, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": -0.6099071502685547, + "rewards/kidney_reward/std": 1.1115127801895142, + "rewards/length2tails_reward/mean": 0.7627843618392944, + "rewards/length2tails_reward/std": 0.28889045119285583, + "rewards/thermo_reward/mean": 0.19467884302139282, + "rewards/thermo_reward/std": 2.1010055541992188, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 297.09375, + "completions/mean_terminated_length": 282.3548278808594, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.18137010838836432, + "epoch": 0.466, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2529149055480957, + "learning_rate": 1.3090169943749473e-06, + "loss": 0.1627, + "num_tokens": 2040514.0, + "reward": 5.853922367095947, + "reward_std": 3.2280406951904297, + "rewards/fitness_reward/mean": 5.682772636413574, + "rewards/fitness_reward/std": 2.829298257827759, + "rewards/kidney_reward/mean": -0.005696475505828857, + "rewards/kidney_reward/std": 1.4939913749694824, + "rewards/length2tails_reward/mean": 0.833533763885498, + "rewards/length2tails_reward/std": 0.2704388499259949, + "rewards/thermo_reward/mean": -0.06877095997333527, + "rewards/thermo_reward/std": 1.9118250608444214, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 269.5625, + "completions/mean_terminated_length": 269.5625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.13818693347275257, + "epoch": 0.468, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.66115403175354, + "learning_rate": 1.3023698907504446e-06, + "loss": -0.0141, + "num_tokens": 2049172.0, + "reward": 5.821286201477051, + "reward_std": 2.581409215927124, + "rewards/fitness_reward/mean": 5.822164535522461, + "rewards/fitness_reward/std": 2.2371702194213867, + "rewards/kidney_reward/mean": -0.13880418241024017, + "rewards/kidney_reward/std": 1.2737301588058472, + "rewards/length2tails_reward/mean": 0.7588136196136475, + "rewards/length2tails_reward/std": 0.34267228841781616, + "rewards/thermo_reward/mean": -0.2423594444990158, + "rewards/thermo_reward/std": 1.951093316078186, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1545769814401865, + "epoch": 0.47, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.714938998222351, + "learning_rate": 1.2957080500440467e-06, + "loss": 0.056, + "num_tokens": 2057968.0, + "reward": 5.967822074890137, + "reward_std": 3.091534376144409, + "rewards/fitness_reward/mean": 5.846134662628174, + "rewards/fitness_reward/std": 2.5937700271606445, + "rewards/kidney_reward/mean": 0.10892915725708008, + "rewards/kidney_reward/std": 1.498690128326416, + "rewards/length2tails_reward/mean": 0.7655143737792969, + "rewards/length2tails_reward/std": 0.3138628900051117, + "rewards/thermo_reward/mean": -0.2483123540878296, + "rewards/thermo_reward/std": 2.0395705699920654, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10781193990260363, + "epoch": 0.472, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49131742119789124, + "learning_rate": 1.2890317969444716e-06, + "loss": -0.0025, + "num_tokens": 2066652.0, + "reward": 5.7716875076293945, + "reward_std": 2.1354894638061523, + "rewards/fitness_reward/mean": 6.086244583129883, + "rewards/fitness_reward/std": 1.8417463302612305, + "rewards/kidney_reward/mean": -0.2309199869632721, + "rewards/kidney_reward/std": 1.3746833801269531, + "rewards/length2tails_reward/mean": 0.8065834045410156, + "rewards/length2tails_reward/std": 0.27748697996139526, + "rewards/thermo_reward/mean": -0.8014854192733765, + "rewards/thermo_reward/std": 2.1019039154052734, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 279.75, + "completions/mean_terminated_length": 279.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14223221316933632, + "epoch": 0.474, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.983418583869934, + "learning_rate": 1.2823414568428766e-06, + "loss": 0.114, + "num_tokens": 2075636.0, + "reward": 6.0223798751831055, + "reward_std": 2.5816240310668945, + "rewards/fitness_reward/mean": 6.011655807495117, + "rewards/fitness_reward/std": 2.2284603118896484, + "rewards/kidney_reward/mean": -0.22647760808467865, + "rewards/kidney_reward/std": 1.2773698568344116, + "rewards/length2tails_reward/mean": 0.8090081214904785, + "rewards/length2tails_reward/std": 0.32567527890205383, + "rewards/thermo_reward/mean": -0.15657764673233032, + "rewards/thermo_reward/std": 2.0863871574401855, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14971571415662766, + "epoch": 0.476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5143944025039673, + "learning_rate": 1.275637355816999e-06, + "loss": 0.0018, + "num_tokens": 2084375.0, + "reward": 6.407557487487793, + "reward_std": 2.3935375213623047, + "rewards/fitness_reward/mean": 6.113137245178223, + "rewards/fitness_reward/std": 2.182605266571045, + "rewards/kidney_reward/mean": -0.023072291165590286, + "rewards/kidney_reward/std": 1.416988492012024, + "rewards/length2tails_reward/mean": 0.8817094564437866, + "rewards/length2tails_reward/std": 0.18478648364543915, + "rewards/thermo_reward/mean": 0.17105701565742493, + "rewards/thermo_reward/std": 1.9090884923934937, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 280.6875, + "completions/mean_terminated_length": 280.6875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14836317393928766, + "epoch": 0.478, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8189572095870972, + "learning_rate": 1.2689198206152656e-06, + "loss": 0.1236, + "num_tokens": 2093389.0, + "reward": 5.878040313720703, + "reward_std": 2.821176528930664, + "rewards/fitness_reward/mean": 5.928430080413818, + "rewards/fitness_reward/std": 2.644589424133301, + "rewards/kidney_reward/mean": -0.15635544061660767, + "rewards/kidney_reward/std": 1.3480225801467896, + "rewards/length2tails_reward/mean": 0.8759337067604065, + "rewards/length2tails_reward/std": 0.20841535925865173, + "rewards/thermo_reward/mean": -0.3823922872543335, + "rewards/thermo_reward/std": 2.18420147895813, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1411373158916831, + "epoch": 0.48, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6222948431968689, + "learning_rate": 1.2621891786408648e-06, + "loss": -0.0029, + "num_tokens": 2102134.0, + "reward": 6.234904766082764, + "reward_std": 1.6686646938323975, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": -0.16512131690979004, + "rewards/kidney_reward/std": 1.1555894613265991, + "rewards/length2tails_reward/mean": 0.8744947910308838, + "rewards/length2tails_reward/std": 0.242760568857193, + "rewards/thermo_reward/mean": -0.16100062429904938, + "rewards/thermo_reward/std": 2.1503944396972656, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 292.1875, + "completions/mean_terminated_length": 277.2903137207031, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.171529040671885, + "epoch": 0.482, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.054434776306152, + "learning_rate": 1.2554457579357905e-06, + "loss": 0.2517, + "num_tokens": 2111516.0, + "reward": 5.733481407165527, + "reward_std": 2.8645641803741455, + "rewards/fitness_reward/mean": 5.781833648681641, + "rewards/fitness_reward/std": 2.8185718059539795, + "rewards/kidney_reward/mean": -0.21167702972888947, + "rewards/kidney_reward/std": 1.3747280836105347, + "rewards/length2tails_reward/mean": 0.8289727568626404, + "rewards/length2tails_reward/std": 0.2659308910369873, + "rewards/thermo_reward/mean": -0.2995145916938782, + "rewards/thermo_reward/std": 2.0944507122039795, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 283.125, + "completions/mean_terminated_length": 283.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1622915817424655, + "epoch": 0.484, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.460477590560913, + "learning_rate": 1.2486898871648551e-06, + "loss": 0.1185, + "num_tokens": 2120608.0, + "reward": 5.520163536071777, + "reward_std": 3.4349632263183594, + "rewards/fitness_reward/mean": 5.4481120109558105, + "rewards/fitness_reward/std": 3.304231643676758, + "rewards/kidney_reward/mean": -0.07317894697189331, + "rewards/kidney_reward/std": 1.4033994674682617, + "rewards/length2tails_reward/mean": 0.8261253833770752, + "rewards/length2tails_reward/std": 0.265546977519989, + "rewards/thermo_reward/mean": -0.1957801878452301, + "rewards/thermo_reward/std": 2.2139532566070557, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1435080012306571, + "epoch": 0.486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4143276810646057, + "learning_rate": 1.2419218955996676e-06, + "loss": 0.002, + "num_tokens": 2129322.0, + "reward": 5.902777671813965, + "reward_std": 3.1238839626312256, + "rewards/fitness_reward/mean": 5.9073591232299805, + "rewards/fitness_reward/std": 2.3329126834869385, + "rewards/kidney_reward/mean": -0.12808813154697418, + "rewards/kidney_reward/std": 1.4451590776443481, + "rewards/length2tails_reward/mean": 0.8207823038101196, + "rewards/length2tails_reward/std": 0.26687997579574585, + "rewards/thermo_reward/mean": -0.2914661467075348, + "rewards/thermo_reward/std": 2.187708854675293, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13894751016050577, + "epoch": 0.488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44220390915870667, + "learning_rate": 1.23514211310259e-06, + "loss": 0.0021, + "num_tokens": 2138040.0, + "reward": 6.006943225860596, + "reward_std": 2.443209648132324, + "rewards/fitness_reward/mean": 5.999410629272461, + "rewards/fitness_reward/std": 1.8257852792739868, + "rewards/kidney_reward/mean": -0.348245769739151, + "rewards/kidney_reward/std": 1.2318572998046875, + "rewards/length2tails_reward/mean": 0.8122061491012573, + "rewards/length2tails_reward/std": 0.276017963886261, + "rewards/thermo_reward/mean": -0.042792417109012604, + "rewards/thermo_reward/std": 2.094027280807495, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 266.3125, + "completions/mean_terminated_length": 266.3125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.17298025824129581, + "epoch": 0.49, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7220354080200195, + "learning_rate": 1.2283508701106558e-06, + "loss": -0.0307, + "num_tokens": 2146594.0, + "reward": 4.9828290939331055, + "reward_std": 3.9370384216308594, + "rewards/fitness_reward/mean": 4.851231098175049, + "rewards/fitness_reward/std": 3.5942587852478027, + "rewards/kidney_reward/mean": -0.17364293336868286, + "rewards/kidney_reward/std": 1.478359580039978, + "rewards/length2tails_reward/mean": 0.7703011631965637, + "rewards/length2tails_reward/std": 0.34750691056251526, + "rewards/thermo_reward/mean": 0.05168786644935608, + "rewards/thermo_reward/std": 2.085308074951172, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.3125, + "completions/mean_terminated_length": 271.3125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.15128333121538162, + "epoch": 0.492, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6833405494689941, + "learning_rate": 1.2215484976194675e-06, + "loss": -0.0132, + "num_tokens": 2155308.0, + "reward": 6.069390296936035, + "reward_std": 2.2282562255859375, + "rewards/fitness_reward/mean": 5.655192852020264, + "rewards/fitness_reward/std": 2.1310982704162598, + "rewards/kidney_reward/mean": 0.05876028537750244, + "rewards/kidney_reward/std": 1.2751644849777222, + "rewards/length2tails_reward/mean": 0.8252370357513428, + "rewards/length2tails_reward/std": 0.31697702407836914, + "rewards/thermo_reward/mean": 0.35701656341552734, + "rewards/thermo_reward/std": 1.7305238246917725, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 276.0625, + "completions/mean_terminated_length": 276.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14280968811362982, + "epoch": 0.494, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45008811354637146, + "learning_rate": 1.2147353271670632e-06, + "loss": 0.0031, + "num_tokens": 2164174.0, + "reward": 6.298008918762207, + "reward_std": 1.5095510482788086, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.48818615078926086, + "rewards/kidney_reward/std": 1.0899704694747925, + "rewards/length2tails_reward/mean": 0.8062364459037781, + "rewards/length2tails_reward/std": 0.30648234486579895, + "rewards/thermo_reward/mean": -0.08955463021993637, + "rewards/thermo_reward/std": 2.0243654251098633, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.1534036574885249, + "epoch": 0.496, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.114922285079956, + "learning_rate": 1.207911690817759e-06, + "loss": 0.0352, + "num_tokens": 2172945.0, + "reward": 5.256867408752441, + "reward_std": 3.398367404937744, + "rewards/fitness_reward/mean": 5.528585433959961, + "rewards/fitness_reward/std": 2.7048287391662598, + "rewards/kidney_reward/mean": -0.4768460988998413, + "rewards/kidney_reward/std": 1.2065118551254272, + "rewards/length2tails_reward/mean": 0.8892650008201599, + "rewards/length2tails_reward/std": 0.20420534908771515, + "rewards/thermo_reward/mean": -0.5112224221229553, + "rewards/thermo_reward/std": 2.252676010131836, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 277.9375, + "completions/mean_terminated_length": 277.9375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.1748004462569952, + "epoch": 0.498, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6925712823867798, + "learning_rate": 1.2010779211459648e-06, + "loss": 0.0527, + "num_tokens": 2181871.0, + "reward": 5.066378593444824, + "reward_std": 3.6943271160125732, + "rewards/fitness_reward/mean": 5.181694507598877, + "rewards/fitness_reward/std": 3.5308194160461426, + "rewards/kidney_reward/mean": 0.019244499504566193, + "rewards/kidney_reward/std": 1.2347512245178223, + "rewards/length2tails_reward/mean": 0.8258422613143921, + "rewards/length2tails_reward/std": 0.28488725423812866, + "rewards/thermo_reward/mean": -0.6627975702285767, + "rewards/thermo_reward/std": 1.9828616380691528, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15635219123214483, + "epoch": 0.5, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4665292501449585, + "learning_rate": 1.194234351219972e-06, + "loss": 0.0, + "num_tokens": 2190591.0, + "reward": 6.17600154876709, + "reward_std": 2.4434690475463867, + "rewards/fitness_reward/mean": 5.8430070877075195, + "rewards/fitness_reward/std": 2.135439872741699, + "rewards/kidney_reward/mean": 0.01794009655714035, + "rewards/kidney_reward/std": 1.4815893173217773, + "rewards/length2tails_reward/mean": 0.8287307620048523, + "rewards/length2tails_reward/std": 0.2827480435371399, + "rewards/thermo_reward/mean": 0.23368358612060547, + "rewards/thermo_reward/std": 2.0299649238586426, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 269.15625, + "completions/mean_terminated_length": 269.15625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.19010232388973236, + "epoch": 0.502, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.568342685699463, + "learning_rate": 1.1873813145857248e-06, + "loss": -0.0351, + "num_tokens": 2199236.0, + "reward": 5.8709235191345215, + "reward_std": 3.2817866802215576, + "rewards/fitness_reward/mean": 5.546199798583984, + "rewards/fitness_reward/std": 3.0127499103546143, + "rewards/kidney_reward/mean": 0.23835378885269165, + "rewards/kidney_reward/std": 1.2311129570007324, + "rewards/length2tails_reward/mean": 0.8606287837028503, + "rewards/length2tails_reward/std": 0.23748965561389923, + "rewards/thermo_reward/mean": -0.019220426678657532, + "rewards/thermo_reward/std": 1.8968557119369507, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 267.96875, + "completions/mean_terminated_length": 267.96875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.18050073459744453, + "epoch": 0.504, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.935668468475342, + "learning_rate": 1.18051914525056e-06, + "loss": -0.044, + "num_tokens": 2207843.0, + "reward": 5.597233772277832, + "reward_std": 2.9492597579956055, + "rewards/fitness_reward/mean": 5.869199752807617, + "rewards/fitness_reward/std": 2.4897172451019287, + "rewards/kidney_reward/mean": -0.33863043785095215, + "rewards/kidney_reward/std": 1.3294682502746582, + "rewards/length2tails_reward/mean": 0.8189811706542969, + "rewards/length2tails_reward/std": 0.30258670449256897, + "rewards/thermo_reward/mean": -0.6147923469543457, + "rewards/thermo_reward/std": 1.957696557044983, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14694790355861187, + "epoch": 0.506, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0416942834854126, + "learning_rate": 1.1736481776669305e-06, + "loss": 0.0087, + "num_tokens": 2216584.0, + "reward": 6.303023815155029, + "reward_std": 2.348665952682495, + "rewards/fitness_reward/mean": 6.197032451629639, + "rewards/fitness_reward/std": 1.7298640012741089, + "rewards/kidney_reward/mean": -0.2510518729686737, + "rewards/kidney_reward/std": 1.2121409177780151, + "rewards/length2tails_reward/mean": 0.8488996028900146, + "rewards/length2tails_reward/std": 0.2167581021785736, + "rewards/thermo_reward/mean": 0.03858397901058197, + "rewards/thermo_reward/std": 2.0483031272888184, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 266.40625, + "completions/mean_terminated_length": 266.40625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.16056162863969803, + "epoch": 0.508, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8536911010742188, + "learning_rate": 1.1667687467161023e-06, + "loss": -0.0806, + "num_tokens": 2225141.0, + "reward": 6.463656902313232, + "reward_std": 2.581038236618042, + "rewards/fitness_reward/mean": 6.238892555236816, + "rewards/fitness_reward/std": 1.9935071468353271, + "rewards/kidney_reward/mean": 0.04028014838695526, + "rewards/kidney_reward/std": 1.5900325775146484, + "rewards/length2tails_reward/mean": 0.8447715044021606, + "rewards/length2tails_reward/std": 0.26232168078422546, + "rewards/thermo_reward/mean": -0.013136669993400574, + "rewards/thermo_reward/std": 2.1490957736968994, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 263.625, + "completions/mean_terminated_length": 263.625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "entropy": 0.13892832025885582, + "epoch": 0.51, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7996954321861267, + "learning_rate": 1.1598811876918349e-06, + "loss": -0.0899, + "num_tokens": 2233609.0, + "reward": 5.907028675079346, + "reward_std": 3.229994058609009, + "rewards/fitness_reward/mean": 5.730376243591309, + "rewards/fitness_reward/std": 2.654407262802124, + "rewards/kidney_reward/mean": -0.02263645827770233, + "rewards/kidney_reward/std": 1.5903600454330444, + "rewards/length2tails_reward/mean": 0.7845340371131897, + "rewards/length2tails_reward/std": 0.3012000024318695, + "rewards/thermo_reward/mean": -0.016326233744621277, + "rewards/thermo_reward/std": 2.06636643409729, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12667525000870228, + "epoch": 0.512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5889103412628174, + "learning_rate": 1.1529858362840382e-06, + "loss": -0.0069, + "num_tokens": 2242297.0, + "reward": 6.215231895446777, + "reward_std": 1.858242154121399, + "rewards/fitness_reward/mean": 6.076353073120117, + "rewards/fitness_reward/std": 1.2157716751098633, + "rewards/kidney_reward/mean": 0.13728436827659607, + "rewards/kidney_reward/std": 1.205039381980896, + "rewards/length2tails_reward/mean": 0.724676787853241, + "rewards/length2tails_reward/std": 0.3713679313659668, + "rewards/thermo_reward/mean": -0.22186486423015594, + "rewards/thermo_reward/std": 2.0663976669311523, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 273.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.146596341393888, + "epoch": 0.514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9780179858207703, + "learning_rate": 1.1460830285624116e-06, + "loss": 0.0013, + "num_tokens": 2251071.0, + "reward": 6.679516792297363, + "reward_std": 1.516502022743225, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.2098180651664734, + "rewards/kidney_reward/std": 1.3393386602401733, + "rewards/length2tails_reward/mean": 0.8087002635002136, + "rewards/length2tails_reward/std": 0.3076275885105133, + "rewards/thermo_reward/mean": -0.2317536324262619, + "rewards/thermo_reward/std": 2.1532649993896484, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 263.53125, + "completions/mean_terminated_length": 263.53125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.16931257583200932, + "epoch": 0.516, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3027231693267822, + "learning_rate": 1.1391731009600653e-06, + "loss": -0.1214, + "num_tokens": 2259536.0, + "reward": 5.622883319854736, + "reward_std": 3.0197651386260986, + "rewards/fitness_reward/mean": 5.9119553565979, + "rewards/fitness_reward/std": 2.6751906871795654, + "rewards/kidney_reward/mean": -0.5463274121284485, + "rewards/kidney_reward/std": 1.3963298797607422, + "rewards/length2tails_reward/mean": 0.9041286110877991, + "rewards/length2tails_reward/std": 0.17764364182949066, + "rewards/thermo_reward/mean": -0.48388081789016724, + "rewards/thermo_reward/std": 2.3388588428497314, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 280.84375, + "completions/mean_terminated_length": 280.84375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.16965920012444258, + "epoch": 0.518, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7378101348876953, + "learning_rate": 1.1322563902571225e-06, + "loss": 0.0972, + "num_tokens": 2268555.0, + "reward": 5.932882785797119, + "reward_std": 2.845289468765259, + "rewards/fitness_reward/mean": 5.962001800537109, + "rewards/fitness_reward/std": 2.4900543689727783, + "rewards/kidney_reward/mean": -0.1587393879890442, + "rewards/kidney_reward/std": 1.3558075428009033, + "rewards/length2tails_reward/mean": 0.868542492389679, + "rewards/length2tails_reward/std": 0.2091815173625946, + "rewards/thermo_reward/mean": -0.3337695002555847, + "rewards/thermo_reward/std": 2.0171165466308594, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 272.9375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.14380551129579544, + "epoch": 0.52, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4390329420566559, + "learning_rate": 1.1253332335643042e-06, + "loss": 0.0044, + "num_tokens": 2277321.0, + "reward": 6.579376220703125, + "reward_std": 1.263524055480957, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.2627079486846924, + "rewards/kidney_reward/std": 1.264919638633728, + "rewards/length2tails_reward/mean": 0.883327305316925, + "rewards/length2tails_reward/std": 0.17075875401496887, + "rewards/thermo_reward/mean": -0.2027987837791443, + "rewards/thermo_reward/std": 2.105802536010742, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 278.125, + "completions/mean_terminated_length": 278.125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.14420662447810173, + "epoch": 0.522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7461501955986023, + "learning_rate": 1.1184039683065012e-06, + "loss": 0.0868, + "num_tokens": 2286253.0, + "reward": 6.388462066650391, + "reward_std": 2.1272552013397217, + "rewards/fitness_reward/mean": 5.992526054382324, + "rewards/fitness_reward/std": 1.858837366104126, + "rewards/kidney_reward/mean": 0.113833948969841, + "rewards/kidney_reward/std": 1.5173331499099731, + "rewards/length2tails_reward/mean": 0.8281090259552002, + "rewards/length2tails_reward/std": 0.2703193128108978, + "rewards/thermo_reward/mean": 0.26398321986198425, + "rewards/thermo_reward/std": 1.9258671998977661, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 277.59375, + "completions/mean_terminated_length": 277.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1499346848577261, + "epoch": 0.524, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4616916179656982, + "learning_rate": 1.1114689322063254e-06, + "loss": 0.1001, + "num_tokens": 2295168.0, + "reward": 6.525530815124512, + "reward_std": 2.2752037048339844, + "rewards/fitness_reward/mean": 6.118172645568848, + "rewards/fitness_reward/std": 2.155168056488037, + "rewards/kidney_reward/mean": 0.012175392359495163, + "rewards/kidney_reward/std": 1.2831966876983643, + "rewards/length2tails_reward/mean": 0.8546762466430664, + "rewards/length2tails_reward/std": 0.2366992086172104, + "rewards/thermo_reward/mean": 0.37520337104797363, + "rewards/thermo_reward/std": 1.903710126876831, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 265.0, + "completions/mean_terminated_length": 265.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "entropy": 0.18243697751313448, + "epoch": 0.526, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7324366569519043, + "learning_rate": 1.1045284632676535e-06, + "loss": -0.1355, + "num_tokens": 2303680.0, + "reward": 5.568639755249023, + "reward_std": 3.6000919342041016, + "rewards/fitness_reward/mean": 5.43031120300293, + "rewards/fitness_reward/std": 3.362344741821289, + "rewards/kidney_reward/mean": 0.006486307829618454, + "rewards/kidney_reward/std": 1.351361870765686, + "rewards/length2tails_reward/mean": 0.8496532440185547, + "rewards/length2tails_reward/std": 0.27389219403266907, + "rewards/thermo_reward/mean": -0.1546546071767807, + "rewards/thermo_reward/std": 2.102027416229248, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 262.78125, + "completions/mean_terminated_length": 262.78125, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "entropy": 0.15076796151697636, + "epoch": 0.528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.42953622341156, + "learning_rate": 1.0975828997591495e-06, + "loss": -0.1263, + "num_tokens": 2312121.0, + "reward": 5.996249198913574, + "reward_std": 2.5402796268463135, + "rewards/fitness_reward/mean": 6.155496120452881, + "rewards/fitness_reward/std": 1.952719807624817, + "rewards/kidney_reward/mean": -0.03896676003932953, + "rewards/kidney_reward/std": 1.3960011005401611, + "rewards/length2tails_reward/mean": 0.6987098455429077, + "rewards/length2tails_reward/std": 0.34228160977363586, + "rewards/thermo_reward/mean": -0.6288823485374451, + "rewards/thermo_reward/std": 2.154966354370117, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.1448302799835801, + "epoch": 0.53, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9759800434112549, + "learning_rate": 1.0906325801977803e-06, + "loss": -0.0063, + "num_tokens": 2320804.0, + "reward": 6.080069541931152, + "reward_std": 3.0260355472564697, + "rewards/fitness_reward/mean": 5.839569091796875, + "rewards/fitness_reward/std": 2.6037561893463135, + "rewards/kidney_reward/mean": 0.0014719441533088684, + "rewards/kidney_reward/std": 1.2305020093917847, + "rewards/length2tails_reward/mean": 0.7766636610031128, + "rewards/length2tails_reward/std": 0.2810504734516144, + "rewards/thermo_reward/mean": 0.09119720011949539, + "rewards/thermo_reward/std": 2.016065835952759, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11841264460235834, + "epoch": 0.532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6071414351463318, + "learning_rate": 1.0836778433323157e-06, + "loss": 0.0077, + "num_tokens": 2329484.0, + "reward": 5.9090495109558105, + "reward_std": 2.454660415649414, + "rewards/fitness_reward/mean": 5.994655609130859, + "rewards/fitness_reward/std": 2.3544161319732666, + "rewards/kidney_reward/mean": -0.20192506909370422, + "rewards/kidney_reward/std": 1.1880277395248413, + "rewards/length2tails_reward/mean": 0.7841193675994873, + "rewards/length2tails_reward/std": 0.2957149147987366, + "rewards/thermo_reward/mean": -0.36134740710258484, + "rewards/thermo_reward/std": 2.038168430328369, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 285.1875, + "completions/mean_terminated_length": 270.06451416015625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.17456878162920475, + "epoch": 0.534, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8888232707977295, + "learning_rate": 1.0767190281268186e-06, + "loss": 0.1326, + "num_tokens": 2338642.0, + "reward": 5.616744041442871, + "reward_std": 3.449208974838257, + "rewards/fitness_reward/mean": 5.386440277099609, + "rewards/fitness_reward/std": 3.242466449737549, + "rewards/kidney_reward/mean": 0.167307510972023, + "rewards/kidney_reward/std": 1.3582977056503296, + "rewards/length2tails_reward/mean": 0.775815486907959, + "rewards/length2tails_reward/std": 0.2952445447444916, + "rewards/thermo_reward/mean": -0.09460898488759995, + "rewards/thermo_reward/std": 2.12760853767395, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 273.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12719732522964478, + "epoch": 0.536, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46400102972984314, + "learning_rate": 1.069756473744125e-06, + "loss": -0.0049, + "num_tokens": 2347435.0, + "reward": 5.965997219085693, + "reward_std": 1.212018370628357, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.5127175450325012, + "rewards/kidney_reward/std": 1.2551615238189697, + "rewards/length2tails_reward/mean": 0.8376985192298889, + "rewards/length2tails_reward/std": 0.2924360930919647, + "rewards/thermo_reward/mean": -0.7447768449783325, + "rewards/thermo_reward/std": 2.066016912460327, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 279.6875, + "completions/mean_terminated_length": 279.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14143760781735182, + "epoch": 0.538, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2774434089660645, + "learning_rate": 1.0627905195293135e-06, + "loss": 0.0847, + "num_tokens": 2356417.0, + "reward": 4.858714580535889, + "reward_std": 4.066253662109375, + "rewards/fitness_reward/mean": 4.707261085510254, + "rewards/fitness_reward/std": 3.9718406200408936, + "rewards/kidney_reward/mean": -0.04432570934295654, + "rewards/kidney_reward/std": 1.2094591856002808, + "rewards/length2tails_reward/mean": 0.773606538772583, + "rewards/length2tails_reward/std": 0.2912238538265228, + "rewards/thermo_reward/mean": -0.03957007825374603, + "rewards/thermo_reward/std": 1.5913993120193481, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 273.625, + "completions/mean_terminated_length": 273.625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.1580907767638564, + "epoch": 0.54, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421672821044922, + "learning_rate": 1.055821504993164e-06, + "loss": 0.0416, + "num_tokens": 2365205.0, + "reward": 5.788364410400391, + "reward_std": 2.8610990047454834, + "rewards/fitness_reward/mean": 5.786640644073486, + "rewards/fitness_reward/std": 2.7975590229034424, + "rewards/kidney_reward/mean": -0.1281442642211914, + "rewards/kidney_reward/std": 1.226645827293396, + "rewards/length2tails_reward/mean": 0.7794501781463623, + "rewards/length2tails_reward/std": 0.33686086535453796, + "rewards/thermo_reward/mean": -0.25813406705856323, + "rewards/thermo_reward/std": 2.3839523792266846, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15838243439793587, + "epoch": 0.542, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9077131748199463, + "learning_rate": 1.0488497697956134e-06, + "loss": 0.0283, + "num_tokens": 2374001.0, + "reward": 5.788833141326904, + "reward_std": 2.9266490936279297, + "rewards/fitness_reward/mean": 5.630908966064453, + "rewards/fitness_reward/std": 2.692647933959961, + "rewards/kidney_reward/mean": -0.3564684987068176, + "rewards/kidney_reward/std": 1.5307029485702515, + "rewards/length2tails_reward/mean": 0.8538310527801514, + "rewards/length2tails_reward/std": 0.21145357191562653, + "rewards/thermo_reward/mean": 0.2454012632369995, + "rewards/thermo_reward/std": 1.9245532751083374, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 281.9375, + "completions/mean_terminated_length": 281.9375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1725459909066558, + "epoch": 0.544, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.738239288330078, + "learning_rate": 1.0418756537291995e-06, + "loss": 0.1011, + "num_tokens": 2383055.0, + "reward": 6.081518173217773, + "reward_std": 2.6257402896881104, + "rewards/fitness_reward/mean": 5.872672080993652, + "rewards/fitness_reward/std": 2.471569776535034, + "rewards/kidney_reward/mean": 0.218740314245224, + "rewards/kidney_reward/std": 1.32453191280365, + "rewards/length2tails_reward/mean": 0.7809317111968994, + "rewards/length2tails_reward/std": 0.2966778874397278, + "rewards/thermo_reward/mean": -0.19151431322097778, + "rewards/thermo_reward/std": 1.905684232711792, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14585830736905336, + "epoch": 0.546, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9640302658081055, + "learning_rate": 1.034899496702501e-06, + "loss": 0.0022, + "num_tokens": 2391748.0, + "reward": 5.990623474121094, + "reward_std": 2.397167921066284, + "rewards/fitness_reward/mean": 5.930706977844238, + "rewards/fitness_reward/std": 2.2439072132110596, + "rewards/kidney_reward/mean": -0.07944446057081223, + "rewards/kidney_reward/std": 1.4068588018417358, + "rewards/length2tails_reward/mean": 0.8241256475448608, + "rewards/length2tails_reward/std": 0.23539581894874573, + "rewards/thermo_reward/mean": -0.21278566122055054, + "rewards/thermo_reward/std": 2.2069005966186523, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 271.21875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.15625540539622307, + "epoch": 0.548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5687947273254395, + "learning_rate": 1.0279216387235689e-06, + "loss": 0.0167, + "num_tokens": 2400459.0, + "reward": 5.703569412231445, + "reward_std": 2.96517014503479, + "rewards/fitness_reward/mean": 5.671274185180664, + "rewards/fitness_reward/std": 2.8722054958343506, + "rewards/kidney_reward/mean": 0.06752986460924149, + "rewards/kidney_reward/std": 1.258401870727539, + "rewards/length2tails_reward/mean": 0.7752214670181274, + "rewards/length2tails_reward/std": 0.32070493698120117, + "rewards/thermo_reward/mean": -0.39054977893829346, + "rewards/thermo_reward/std": 2.105935573577881, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.1875, + "completions/mean_terminated_length": 271.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1423500245437026, + "epoch": 0.55, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6806622743606567, + "learning_rate": 1.020942419883357e-06, + "loss": 0.0033, + "num_tokens": 2409169.0, + "reward": 6.5700788497924805, + "reward_std": 1.7746098041534424, + "rewards/fitness_reward/mean": 6.296484470367432, + "rewards/fitness_reward/std": 1.6677184104919434, + "rewards/kidney_reward/mean": -0.19006654620170593, + "rewards/kidney_reward/std": 1.2162213325500488, + "rewards/length2tails_reward/mean": 0.8175604939460754, + "rewards/length2tails_reward/std": 0.2305397242307663, + "rewards/thermo_reward/mean": 0.32847630977630615, + "rewards/thermo_reward/std": 1.8085222244262695, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 280.28125, + "completions/mean_terminated_length": 280.28125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14487546402961016, + "epoch": 0.552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8299687504768372, + "learning_rate": 1.0139621803391454e-06, + "loss": 0.0467, + "num_tokens": 2418170.0, + "reward": 6.545259952545166, + "reward_std": 1.3277207612991333, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.17356184124946594, + "rewards/kidney_reward/std": 1.263597846031189, + "rewards/length2tails_reward/mean": 0.8215415477752686, + "rewards/length2tails_reward/std": 0.2258286327123642, + "rewards/thermo_reward/mean": -0.3292858600616455, + "rewards/thermo_reward/std": 2.039425849914551, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1272141383960843, + "epoch": 0.554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7546288371086121, + "learning_rate": 1.0069812602979615e-06, + "loss": 0.0003, + "num_tokens": 2426857.0, + "reward": 5.912240028381348, + "reward_std": 2.3872926235198975, + "rewards/fitness_reward/mean": 5.988445281982422, + "rewards/fitness_reward/std": 1.878536343574524, + "rewards/kidney_reward/mean": -0.03585119545459747, + "rewards/kidney_reward/std": 1.2344835996627808, + "rewards/length2tails_reward/mean": 0.7913399934768677, + "rewards/length2tails_reward/std": 0.2957223057746887, + "rewards/thermo_reward/mean": -0.512228786945343, + "rewards/thermo_reward/std": 2.159263849258423, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 279.125, + "completions/mean_terminated_length": 279.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17997045908123255, + "epoch": 0.556, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.979198694229126, + "learning_rate": 1e-06, + "loss": 0.0132, + "num_tokens": 2435821.0, + "reward": 5.03554630279541, + "reward_std": 3.796201229095459, + "rewards/fitness_reward/mean": 4.998668193817139, + "rewards/fitness_reward/std": 3.7583067417144775, + "rewards/kidney_reward/mean": 0.06615039706230164, + "rewards/kidney_reward/std": 1.4136227369308472, + "rewards/length2tails_reward/mean": 0.7691352367401123, + "rewards/length2tails_reward/std": 0.32233336567878723, + "rewards/thermo_reward/mean": -0.376961886882782, + "rewards/thermo_reward/std": 1.9719232320785522, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14837611094117165, + "epoch": 0.558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6687464118003845, + "learning_rate": 9.930187397020384e-07, + "loss": -0.0023, + "num_tokens": 2444545.0, + "reward": 5.772210121154785, + "reward_std": 2.6880130767822266, + "rewards/fitness_reward/mean": 5.614042282104492, + "rewards/fitness_reward/std": 2.792747974395752, + "rewards/kidney_reward/mean": -0.2538801431655884, + "rewards/kidney_reward/std": 1.353262186050415, + "rewards/length2tails_reward/mean": 0.8017355799674988, + "rewards/length2tails_reward/std": 0.2695324122905731, + "rewards/thermo_reward/mean": 0.1693476289510727, + "rewards/thermo_reward/std": 2.0165786743164062, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.21875, + "completions/mean_terminated_length": 269.21875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.15388427395373583, + "epoch": 0.56, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8253613710403442, + "learning_rate": 9.860378196608547e-07, + "loss": -0.0131, + "num_tokens": 2453192.0, + "reward": 6.641109466552734, + "reward_std": 2.0452044010162354, + "rewards/fitness_reward/mean": 6.2334065437316895, + "rewards/fitness_reward/std": 2.024541139602661, + "rewards/kidney_reward/mean": -0.07310893386602402, + "rewards/kidney_reward/std": 1.2692768573760986, + "rewards/length2tails_reward/mean": 0.7746356725692749, + "rewards/length2tails_reward/std": 0.2963363826274872, + "rewards/thermo_reward/mean": 0.5011963844299316, + "rewards/thermo_reward/std": 1.5752555131912231, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 275.09375, + "completions/mean_terminated_length": 275.09375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.16355876345187426, + "epoch": 0.562, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3657917976379395, + "learning_rate": 9.790575801166431e-07, + "loss": 0.0445, + "num_tokens": 2462027.0, + "reward": 5.599199295043945, + "reward_std": 3.717531204223633, + "rewards/fitness_reward/mean": 5.3801445960998535, + "rewards/fitness_reward/std": 3.222134828567505, + "rewards/kidney_reward/mean": 0.3526960015296936, + "rewards/kidney_reward/std": 1.5738078355789185, + "rewards/length2tails_reward/mean": 0.8324109315872192, + "rewards/length2tails_reward/std": 0.26209557056427, + "rewards/thermo_reward/mean": -0.3307921588420868, + "rewards/thermo_reward/std": 2.23809814453125, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.12554469238966703, + "epoch": 0.564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.652640700340271, + "learning_rate": 9.720783612764313e-07, + "loss": -0.0163, + "num_tokens": 2470667.0, + "reward": 6.005587100982666, + "reward_std": 2.7134058475494385, + "rewards/fitness_reward/mean": 6.015375137329102, + "rewards/fitness_reward/std": 2.2088727951049805, + "rewards/kidney_reward/mean": -0.1422906517982483, + "rewards/kidney_reward/std": 1.4053009748458862, + "rewards/length2tails_reward/mean": 0.7117120623588562, + "rewards/length2tails_reward/std": 0.341067373752594, + "rewards/thermo_reward/mean": -0.23314118385314941, + "rewards/thermo_reward/std": 1.9345332384109497, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14547014515846968, + "epoch": 0.566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6224685907363892, + "learning_rate": 9.651005032974993e-07, + "loss": 0.0016, + "num_tokens": 2479362.0, + "reward": 6.701007843017578, + "reward_std": 1.1542237997055054, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.002499748021364212, + "rewards/kidney_reward/std": 1.3801093101501465, + "rewards/length2tails_reward/mean": 0.792314887046814, + "rewards/length2tails_reward/std": 0.26681551337242126, + "rewards/thermo_reward/mean": 0.2327187955379486, + "rewards/thermo_reward/std": 1.6495410203933716, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.96875, + "completions/mean_terminated_length": 271.96875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.15400028508156538, + "epoch": 0.568, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3945049047470093, + "learning_rate": 9.581243462708005e-07, + "loss": 0.0046, + "num_tokens": 2488097.0, + "reward": 5.999971389770508, + "reward_std": 2.1510202884674072, + "rewards/fitness_reward/mean": 5.867298603057861, + "rewards/fitness_reward/std": 2.019099235534668, + "rewards/kidney_reward/mean": -0.004271138459444046, + "rewards/kidney_reward/std": 1.3482393026351929, + "rewards/length2tails_reward/mean": 0.7591778039932251, + "rewards/length2tails_reward/std": 0.32747241854667664, + "rewards/thermo_reward/mean": -0.10997310280799866, + "rewards/thermo_reward/std": 2.031148672103882, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13353883754462004, + "epoch": 0.57, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4228142499923706, + "learning_rate": 9.511502302043867e-07, + "loss": 0.0021, + "num_tokens": 2496780.0, + "reward": 6.48953104019165, + "reward_std": 1.3167486190795898, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.1205441802740097, + "rewards/kidney_reward/std": 1.3729273080825806, + "rewards/length2tails_reward/mean": 0.7895287275314331, + "rewards/length2tails_reward/std": 0.30240556597709656, + "rewards/thermo_reward/mean": -0.2717765271663666, + "rewards/thermo_reward/std": 1.9287211894989014, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 273.6875, + "completions/mean_terminated_length": 273.6875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.17670563887804747, + "epoch": 0.572, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1856436729431152, + "learning_rate": 9.441784950068361e-07, + "loss": 0.022, + "num_tokens": 2505570.0, + "reward": 5.552561283111572, + "reward_std": 3.0569052696228027, + "rewards/fitness_reward/mean": 5.5452375411987305, + "rewards/fitness_reward/std": 3.0043234825134277, + "rewards/kidney_reward/mean": -0.028089947998523712, + "rewards/kidney_reward/std": 1.2665034532546997, + "rewards/length2tails_reward/mean": 0.8060954809188843, + "rewards/length2tails_reward/std": 0.26724883913993835, + "rewards/thermo_reward/mean": -0.36030930280685425, + "rewards/thermo_reward/std": 2.0128753185272217, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.13899183459579945, + "epoch": 0.574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8525024056434631, + "learning_rate": 9.372094804706866e-07, + "loss": -0.0047, + "num_tokens": 2514271.0, + "reward": 5.807683944702148, + "reward_std": 2.6605942249298096, + "rewards/fitness_reward/mean": 5.804408073425293, + "rewards/fitness_reward/std": 2.3758177757263184, + "rewards/kidney_reward/mean": 0.0376010537147522, + "rewards/kidney_reward/std": 1.4945979118347168, + "rewards/length2tails_reward/mean": 0.8038283586502075, + "rewards/length2tails_reward/std": 0.29997193813323975, + "rewards/thermo_reward/mean": -0.4329639673233032, + "rewards/thermo_reward/std": 2.102201223373413, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13818454928696156, + "epoch": 0.576, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0629791021347046, + "learning_rate": 9.302435262558747e-07, + "loss": 0.0278, + "num_tokens": 2523044.0, + "reward": 5.959643363952637, + "reward_std": 2.1231353282928467, + "rewards/fitness_reward/mean": 5.888861656188965, + "rewards/fitness_reward/std": 1.9181643724441528, + "rewards/kidney_reward/mean": -0.033825114369392395, + "rewards/kidney_reward/std": 1.4832185506820679, + "rewards/length2tails_reward/mean": 0.8217208981513977, + "rewards/length2tails_reward/std": 0.28788596391677856, + "rewards/thermo_reward/mean": -0.23547188937664032, + "rewards/thermo_reward/std": 2.246788501739502, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 258.8125, + "completions/mean_terminated_length": 258.8125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.1594973113387823, + "epoch": 0.578, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1755261421203613, + "learning_rate": 9.232809718731813e-07, + "loss": -0.1287, + "num_tokens": 2531358.0, + "reward": 5.4473443031311035, + "reward_std": 3.200383186340332, + "rewards/fitness_reward/mean": 5.256423473358154, + "rewards/fitness_reward/std": 3.2978856563568115, + "rewards/kidney_reward/mean": 0.06129808351397514, + "rewards/kidney_reward/std": 1.3483083248138428, + "rewards/length2tails_reward/mean": 0.8028110265731812, + "rewards/length2tails_reward/std": 0.3241569399833679, + "rewards/thermo_reward/mean": -0.08086204528808594, + "rewards/thermo_reward/std": 2.1148059368133545, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.4375, + "completions/mean_terminated_length": 269.4375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12140040285885334, + "epoch": 0.58, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4034607410430908, + "learning_rate": 9.163221566676847e-07, + "loss": -0.0016, + "num_tokens": 2540012.0, + "reward": 6.423543930053711, + "reward_std": 1.8661997318267822, + "rewards/fitness_reward/mean": 6.188730716705322, + "rewards/fitness_reward/std": 1.7741566896438599, + "rewards/kidney_reward/mean": -0.32657331228256226, + "rewards/kidney_reward/std": 1.2123817205429077, + "rewards/length2tails_reward/mean": 0.7576315402984619, + "rewards/length2tails_reward/std": 0.2643510401248932, + "rewards/thermo_reward/mean": 0.4173838794231415, + "rewards/thermo_reward/std": 1.7833548784255981, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 277.9375, + "completions/mean_terminated_length": 277.9375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14943666849285364, + "epoch": 0.582, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.043025255203247, + "learning_rate": 9.093674198022199e-07, + "loss": 0.0795, + "num_tokens": 2548938.0, + "reward": 5.3123087882995605, + "reward_std": 3.6125266551971436, + "rewards/fitness_reward/mean": 5.243748664855957, + "rewards/fitness_reward/std": 3.3299062252044678, + "rewards/kidney_reward/mean": -0.48788362741470337, + "rewards/kidney_reward/std": 1.26163649559021, + "rewards/length2tails_reward/mean": 0.8052570223808289, + "rewards/length2tails_reward/std": 0.28455302119255066, + "rewards/thermo_reward/mean": 0.22237543761730194, + "rewards/thermo_reward/std": 1.7955067157745361, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.0625, + "completions/mean_terminated_length": 269.0625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.1679877294227481, + "epoch": 0.584, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9829885959625244, + "learning_rate": 9.024171002408506e-07, + "loss": -0.0103, + "num_tokens": 2557580.0, + "reward": 5.355369567871094, + "reward_std": 3.582345724105835, + "rewards/fitness_reward/mean": 5.362346649169922, + "rewards/fitness_reward/std": 3.2837119102478027, + "rewards/kidney_reward/mean": -0.1609899401664734, + "rewards/kidney_reward/std": 1.333462119102478, + "rewards/length2tails_reward/mean": 0.744073748588562, + "rewards/length2tails_reward/std": 0.31323665380477905, + "rewards/thermo_reward/mean": -0.2250012755393982, + "rewards/thermo_reward/std": 2.1173300743103027, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 267.40625, + "completions/mean_terminated_length": 267.40625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.13892160076647997, + "epoch": 0.586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8492016792297363, + "learning_rate": 8.954715367323466e-07, + "loss": -0.1088, + "num_tokens": 2566169.0, + "reward": 5.543354034423828, + "reward_std": 2.9506261348724365, + "rewards/fitness_reward/mean": 5.78358268737793, + "rewards/fitness_reward/std": 2.8111298084259033, + "rewards/kidney_reward/mean": -0.7873251438140869, + "rewards/kidney_reward/std": 1.2815355062484741, + "rewards/length2tails_reward/mean": 0.8598273992538452, + "rewards/length2tails_reward/std": 0.22865496575832367, + "rewards/thermo_reward/mean": -0.12304553389549255, + "rewards/thermo_reward/std": 2.137643814086914, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 282.40625, + "completions/mean_terminated_length": 282.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16803938429802656, + "epoch": 0.588, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.047661304473877, + "learning_rate": 8.885310677936746e-07, + "loss": 0.077, + "num_tokens": 2575238.0, + "reward": 5.033448696136475, + "reward_std": 3.4656853675842285, + "rewards/fitness_reward/mean": 4.970721244812012, + "rewards/fitness_reward/std": 3.5634567737579346, + "rewards/kidney_reward/mean": 0.032870735973119736, + "rewards/kidney_reward/std": 1.3879930973052979, + "rewards/length2tails_reward/mean": 0.7979098558425903, + "rewards/length2tails_reward/std": 0.3307546377182007, + "rewards/thermo_reward/mean": -0.3063697814941406, + "rewards/thermo_reward/std": 2.1015470027923584, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.128823634237051, + "epoch": 0.59, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35255303978919983, + "learning_rate": 8.81596031693499e-07, + "loss": -0.001, + "num_tokens": 2583933.0, + "reward": 6.1219048500061035, + "reward_std": 2.2153446674346924, + "rewards/fitness_reward/mean": 6.1872687339782715, + "rewards/fitness_reward/std": 1.7819706201553345, + "rewards/kidney_reward/mean": -0.0763387680053711, + "rewards/kidney_reward/std": 1.3682671785354614, + "rewards/length2tails_reward/mean": 0.7683770060539246, + "rewards/length2tails_reward/std": 0.31339552998542786, + "rewards/thermo_reward/mean": -0.4385773539543152, + "rewards/thermo_reward/std": 2.0758681297302246, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.12887464184314013, + "epoch": 0.592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7549186944961548, + "learning_rate": 8.746667664356955e-07, + "loss": -0.0288, + "num_tokens": 2592597.0, + "reward": 5.737493515014648, + "reward_std": 2.3524863719940186, + "rewards/fitness_reward/mean": 6.003878116607666, + "rewards/fitness_reward/std": 2.2695066928863525, + "rewards/kidney_reward/mean": -0.4935506284236908, + "rewards/kidney_reward/std": 1.170538067817688, + "rewards/length2tails_reward/mean": 0.8755277395248413, + "rewards/length2tails_reward/std": 0.22547662258148193, + "rewards/thermo_reward/mean": -0.4769827723503113, + "rewards/thermo_reward/std": 2.1505541801452637, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.1875, + "completions/mean_terminated_length": 269.1875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.13547687977552414, + "epoch": 0.594, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8880061507225037, + "learning_rate": 8.677436097428774e-07, + "loss": 0.0043, + "num_tokens": 2601243.0, + "reward": 6.46945858001709, + "reward_std": 1.442918062210083, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.06878862529993057, + "rewards/kidney_reward/std": 1.2600641250610352, + "rewards/length2tails_reward/mean": 0.7656205892562866, + "rewards/length2tails_reward/std": 0.2914549708366394, + "rewards/thermo_reward/mean": -0.07734374701976776, + "rewards/thermo_reward/std": 2.0001652240753174, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 272.0625, + "completions/mean_terminated_length": 272.0625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.1765116062015295, + "epoch": 0.596, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8891026973724365, + "learning_rate": 8.608268990399348e-07, + "loss": 0.0201, + "num_tokens": 2609981.0, + "reward": 6.024785995483398, + "reward_std": 3.543193817138672, + "rewards/fitness_reward/mean": 5.517086982727051, + "rewards/fitness_reward/std": 3.1167924404144287, + "rewards/kidney_reward/mean": -0.03205416351556778, + "rewards/kidney_reward/std": 1.2035332918167114, + "rewards/length2tails_reward/mean": 0.8274141550064087, + "rewards/length2tails_reward/std": 0.23523229360580444, + "rewards/thermo_reward/mean": 0.6337454319000244, + "rewards/thermo_reward/std": 1.706175446510315, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 278.6875, + "completions/mean_terminated_length": 278.6875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.15879554394632578, + "epoch": 0.598, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7817070484161377, + "learning_rate": 8.539169714375885e-07, + "loss": 0.1088, + "num_tokens": 2618931.0, + "reward": 6.548776626586914, + "reward_std": 2.27897310256958, + "rewards/fitness_reward/mean": 6.193077087402344, + "rewards/fitness_reward/std": 1.7509506940841675, + "rewards/kidney_reward/mean": 0.06910586357116699, + "rewards/kidney_reward/std": 1.4774768352508545, + "rewards/length2tails_reward/mean": 0.8031742572784424, + "rewards/length2tails_reward/std": 0.27509331703186035, + "rewards/thermo_reward/mean": 0.24070586264133453, + "rewards/thermo_reward/std": 1.9352507591247559, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13626266457140446, + "epoch": 0.6, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3759794235229492, + "learning_rate": 8.47014163715962e-07, + "loss": 0.0122, + "num_tokens": 2627639.0, + "reward": 6.452775001525879, + "reward_std": 2.030623197555542, + "rewards/fitness_reward/mean": 6.20017147064209, + "rewards/fitness_reward/std": 1.7131540775299072, + "rewards/kidney_reward/mean": -0.03645924851298332, + "rewards/kidney_reward/std": 1.2895108461380005, + "rewards/length2tails_reward/mean": 0.7727314233779907, + "rewards/length2tails_reward/std": 0.3181847929954529, + "rewards/thermo_reward/mean": 0.15530048310756683, + "rewards/thermo_reward/std": 1.8542375564575195, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.84375, + "completions/mean_terminated_length": 268.84375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.15447265189141035, + "epoch": 0.602, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0929714441299438, + "learning_rate": 8.401188123081652e-07, + "loss": -0.044, + "num_tokens": 2636274.0, + "reward": 5.9095001220703125, + "reward_std": 2.3706412315368652, + "rewards/fitness_reward/mean": 5.9298272132873535, + "rewards/fitness_reward/std": 2.1688692569732666, + "rewards/kidney_reward/mean": -0.1626996546983719, + "rewards/kidney_reward/std": 1.2477351427078247, + "rewards/length2tails_reward/mean": 0.8582019805908203, + "rewards/length2tails_reward/std": 0.24992448091506958, + "rewards/thermo_reward/mean": -0.30705544352531433, + "rewards/thermo_reward/std": 2.1065003871917725, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.28125, + "completions/mean_terminated_length": 270.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12860836926847696, + "epoch": 0.604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2735062837600708, + "learning_rate": 8.332312532838978e-07, + "loss": 0.0021, + "num_tokens": 2644955.0, + "reward": 6.560126304626465, + "reward_std": 1.2234059572219849, + "rewards/fitness_reward/mean": 6.2823309898376465, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.23910409212112427, + "rewards/kidney_reward/std": 1.4007219076156616, + "rewards/length2tails_reward/mean": 0.7388077974319458, + "rewards/length2tails_reward/std": 0.28337112069129944, + "rewards/thermo_reward/mean": -0.052918002009391785, + "rewards/thermo_reward/std": 1.9731281995773315, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 264.90625, + "completions/mean_terminated_length": 264.90625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.1408443758264184, + "epoch": 0.606, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8262622952461243, + "learning_rate": 8.263518223330696e-07, + "loss": -0.0483, + "num_tokens": 2653464.0, + "reward": 5.640756130218506, + "reward_std": 3.0255091190338135, + "rewards/fitness_reward/mean": 5.2705078125, + "rewards/fitness_reward/std": 2.9505343437194824, + "rewards/kidney_reward/mean": 0.08989809453487396, + "rewards/kidney_reward/std": 1.3180209398269653, + "rewards/length2tails_reward/mean": 0.7812709212303162, + "rewards/length2tails_reward/std": 0.2842097282409668, + "rewards/thermo_reward/mean": 0.2599630355834961, + "rewards/thermo_reward/std": 1.8476253747940063, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13957977015525103, + "epoch": 0.608, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1814037561416626, + "learning_rate": 8.194808547494401e-07, + "loss": -0.002, + "num_tokens": 2662137.0, + "reward": 5.991512775421143, + "reward_std": 2.895458459854126, + "rewards/fitness_reward/mean": 5.794535160064697, + "rewards/fitness_reward/std": 2.769155263900757, + "rewards/kidney_reward/mean": -0.18175917863845825, + "rewards/kidney_reward/std": 1.2326058149337769, + "rewards/length2tails_reward/mean": 0.7655566930770874, + "rewards/length2tails_reward/std": 0.3070470690727234, + "rewards/thermo_reward/mean": 0.1929364800453186, + "rewards/thermo_reward/std": 1.990727424621582, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.16747733019292355, + "epoch": 0.61, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.186681032180786, + "learning_rate": 8.126186854142751e-07, + "loss": -0.0209, + "num_tokens": 2670825.0, + "reward": 5.879621505737305, + "reward_std": 3.4141082763671875, + "rewards/fitness_reward/mean": 5.552954196929932, + "rewards/fitness_reward/std": 3.2934844493865967, + "rewards/kidney_reward/mean": -0.05889531224966049, + "rewards/kidney_reward/std": 1.345290184020996, + "rewards/length2tails_reward/mean": 0.8045759201049805, + "rewards/length2tails_reward/std": 0.27133816480636597, + "rewards/thermo_reward/mean": 0.30994102358818054, + "rewards/thermo_reward/std": 1.9813530445098877, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 264.5625, + "completions/mean_terminated_length": 264.5625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.14618254080414772, + "epoch": 0.612, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9095790386199951, + "learning_rate": 8.057656487800282e-07, + "loss": -0.0528, + "num_tokens": 2679323.0, + "reward": 5.235918998718262, + "reward_std": 3.7534329891204834, + "rewards/fitness_reward/mean": 5.308850288391113, + "rewards/fitness_reward/std": 3.433398962020874, + "rewards/kidney_reward/mean": -0.2223380208015442, + "rewards/kidney_reward/std": 1.5016045570373535, + "rewards/length2tails_reward/mean": 0.8398911952972412, + "rewards/length2tails_reward/std": 0.2600333094596863, + "rewards/thermo_reward/mean": -0.3434699773788452, + "rewards/thermo_reward/std": 2.2440428733825684, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 267.0, + "completions/mean_terminated_length": 267.0, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.17721352633088827, + "epoch": 0.614, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4808382987976074, + "learning_rate": 7.989220788540355e-07, + "loss": -0.0362, + "num_tokens": 2687899.0, + "reward": 6.460134983062744, + "reward_std": 2.5526137351989746, + "rewards/fitness_reward/mean": 5.929336071014404, + "rewards/fitness_reward/std": 2.6182987689971924, + "rewards/kidney_reward/mean": 0.3675292432308197, + "rewards/kidney_reward/std": 1.189038634300232, + "rewards/length2tails_reward/mean": 0.8435029983520508, + "rewards/length2tails_reward/std": 0.240304633975029, + "rewards/thermo_reward/mean": 0.27231690287590027, + "rewards/thermo_reward/std": 1.809454321861267, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12361082248389721, + "epoch": 0.616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5217465162277222, + "learning_rate": 7.920883091822408e-07, + "loss": -0.0029, + "num_tokens": 2696539.0, + "reward": 5.749210357666016, + "reward_std": 2.427088737487793, + "rewards/fitness_reward/mean": 5.804673671722412, + "rewards/fitness_reward/std": 2.3728742599487305, + "rewards/kidney_reward/mean": -0.2303834855556488, + "rewards/kidney_reward/std": 1.2232918739318848, + "rewards/length2tails_reward/mean": 0.7257441282272339, + "rewards/length2tails_reward/std": 0.32820820808410645, + "rewards/thermo_reward/mean": -0.24341507256031036, + "rewards/thermo_reward/std": 1.9496954679489136, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.21875, + "completions/mean_terminated_length": 272.21875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1583899725228548, + "epoch": 0.618, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7271563410758972, + "learning_rate": 7.852646728329367e-07, + "loss": 0.0016, + "num_tokens": 2705282.0, + "reward": 6.917350769042969, + "reward_std": 2.0332014560699463, + "rewards/fitness_reward/mean": 6.211165428161621, + "rewards/fitness_reward/std": 1.6548012495040894, + "rewards/kidney_reward/mean": 0.4039209187030792, + "rewards/kidney_reward/std": 1.3662333488464355, + "rewards/length2tails_reward/mean": 0.8635237216949463, + "rewards/length2tails_reward/std": 0.18739113211631775, + "rewards/thermo_reward/mean": 0.5766885280609131, + "rewards/thermo_reward/std": 1.5489522218704224, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.46875, + "completions/mean_terminated_length": 271.46875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14540214743465185, + "epoch": 0.62, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38060182332992554, + "learning_rate": 7.784515023805327e-07, + "loss": -0.0004, + "num_tokens": 2714001.0, + "reward": 5.810445308685303, + "reward_std": 3.080669641494751, + "rewards/fitness_reward/mean": 5.936678886413574, + "rewards/fitness_reward/std": 2.600179672241211, + "rewards/kidney_reward/mean": -0.6011654138565063, + "rewards/kidney_reward/std": 1.3190113306045532, + "rewards/length2tails_reward/mean": 0.8296725153923035, + "rewards/length2tails_reward/std": 0.23379944264888763, + "rewards/thermo_reward/mean": -0.06613816320896149, + "rewards/thermo_reward/std": 1.96107816696167, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12451317626982927, + "epoch": 0.622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5161978602409363, + "learning_rate": 7.716491298893441e-07, + "loss": -0.0002, + "num_tokens": 2722678.0, + "reward": 6.549075126647949, + "reward_std": 1.461757779121399, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.6216371059417725, + "rewards/kidney_reward/std": 1.185909390449524, + "rewards/length2tails_reward/mean": 0.756341814994812, + "rewards/length2tails_reward/std": 0.2793433666229248, + "rewards/thermo_reward/mean": 0.5709758996963501, + "rewards/thermo_reward/std": 1.6642720699310303, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.14000983629375696, + "epoch": 0.624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.64600670337677, + "learning_rate": 7.648578868974099e-07, + "loss": -0.0092, + "num_tokens": 2731339.0, + "reward": 5.813279628753662, + "reward_std": 2.7142364978790283, + "rewards/fitness_reward/mean": 5.919684410095215, + "rewards/fitness_reward/std": 2.6460466384887695, + "rewards/kidney_reward/mean": -0.23450270295143127, + "rewards/kidney_reward/std": 1.137681484222412, + "rewards/length2tails_reward/mean": 0.783237636089325, + "rewards/length2tails_reward/std": 0.29252496361732483, + "rewards/thermo_reward/mean": -0.36992621421813965, + "rewards/thermo_reward/std": 1.9649900197982788, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 282.8125, + "completions/mean_terminated_length": 267.6128845214844, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.14463838562369347, + "epoch": 0.626, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.689681529998779, + "learning_rate": 7.580781044003324e-07, + "loss": 0.1848, + "num_tokens": 2740421.0, + "reward": 6.179220199584961, + "reward_std": 3.108948230743408, + "rewards/fitness_reward/mean": 5.872677803039551, + "rewards/fitness_reward/std": 2.8283746242523193, + "rewards/kidney_reward/mean": 0.012635260820388794, + "rewards/kidney_reward/std": 1.3495537042617798, + "rewards/length2tails_reward/mean": 0.7865300178527832, + "rewards/length2tails_reward/std": 0.28116706013679504, + "rewards/thermo_reward/mean": 0.20718349516391754, + "rewards/thermo_reward/std": 1.7808927297592163, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 280.46875, + "completions/mean_terminated_length": 280.46875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.17828217148780823, + "epoch": 0.628, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1162149906158447, + "learning_rate": 7.513101128351453e-07, + "loss": 0.0979, + "num_tokens": 2749428.0, + "reward": 5.4172563552856445, + "reward_std": 3.199577569961548, + "rewards/fitness_reward/mean": 5.461665630340576, + "rewards/fitness_reward/std": 2.932718276977539, + "rewards/kidney_reward/mean": -0.5087636709213257, + "rewards/kidney_reward/std": 1.2203556299209595, + "rewards/length2tails_reward/mean": 0.780242919921875, + "rewards/length2tails_reward/std": 0.3494933247566223, + "rewards/thermo_reward/mean": 0.02982361614704132, + "rewards/thermo_reward/std": 2.083951711654663, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 275.78125, + "completions/mean_terminated_length": 275.78125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.261963858269155, + "epoch": 0.63, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.851346731185913, + "learning_rate": 7.445542420642096e-07, + "loss": 0.0294, + "num_tokens": 2758285.0, + "reward": 5.509346008300781, + "reward_std": 4.53924560546875, + "rewards/fitness_reward/mean": 4.731644630432129, + "rewards/fitness_reward/std": 4.147398948669434, + "rewards/kidney_reward/mean": 0.5509114861488342, + "rewards/kidney_reward/std": 1.3219982385635376, + "rewards/length2tails_reward/mean": 0.8193047046661377, + "rewards/length2tails_reward/std": 0.25560668110847473, + "rewards/thermo_reward/mean": 0.5948399901390076, + "rewards/thermo_reward/std": 1.9399741888046265, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.84375, + "completions/mean_terminated_length": 269.84375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.1475894758477807, + "epoch": 0.632, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0615999698638916, + "learning_rate": 7.378108213591354e-07, + "loss": -0.0026, + "num_tokens": 2766952.0, + "reward": 6.231350898742676, + "reward_std": 2.787792921066284, + "rewards/fitness_reward/mean": 5.961513996124268, + "rewards/fitness_reward/std": 2.4785311222076416, + "rewards/kidney_reward/mean": 0.45111918449401855, + "rewards/kidney_reward/std": 1.5606229305267334, + "rewards/length2tails_reward/mean": 0.7947746515274048, + "rewards/length2tails_reward/std": 0.26889491081237793, + "rewards/thermo_reward/mean": -0.30883270502090454, + "rewards/thermo_reward/std": 2.0008814334869385, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13641659449785948, + "epoch": 0.634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.543372631072998, + "learning_rate": 7.310801793847343e-07, + "loss": 0.0021, + "num_tokens": 2775656.0, + "reward": 6.5946245193481445, + "reward_std": 1.5121303796768188, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.062211520969867706, + "rewards/kidney_reward/std": 1.4607387781143188, + "rewards/length2tails_reward/mean": 0.799560546875, + "rewards/length2tails_reward/std": 0.27528977394104004, + "rewards/thermo_reward/mean": -0.3309165835380554, + "rewards/thermo_reward/std": 2.116473913192749, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 276.15625, + "completions/mean_terminated_length": 276.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14614746067672968, + "epoch": 0.636, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.1665120124816895, + "learning_rate": 7.243626441830009e-07, + "loss": 0.0701, + "num_tokens": 2784525.0, + "reward": 6.4105424880981445, + "reward_std": 2.441462278366089, + "rewards/fitness_reward/mean": 6.148130416870117, + "rewards/fitness_reward/std": 1.9925299882888794, + "rewards/kidney_reward/mean": 0.13602781295776367, + "rewards/kidney_reward/std": 1.4797165393829346, + "rewards/length2tails_reward/mean": 0.8131794929504395, + "rewards/length2tails_reward/std": 0.2802060544490814, + "rewards/thermo_reward/mean": -0.017794162034988403, + "rewards/thermo_reward/std": 2.036123275756836, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 278.25, + "completions/mean_terminated_length": 278.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1566220736131072, + "epoch": 0.638, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6108603477478027, + "learning_rate": 7.176585431571233e-07, + "loss": 0.0987, + "num_tokens": 2793461.0, + "reward": 6.976290225982666, + "reward_std": 2.466521978378296, + "rewards/fitness_reward/mean": 6.239953994750977, + "rewards/fitness_reward/std": 1.987504005432129, + "rewards/kidney_reward/mean": 0.2159939557313919, + "rewards/kidney_reward/std": 1.3283164501190186, + "rewards/length2tails_reward/mean": 0.8266392350196838, + "rewards/length2tails_reward/std": 0.24396318197250366, + "rewards/thermo_reward/mean": 0.8433594107627869, + "rewards/thermo_reward/std": 1.7143508195877075, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.625, + "completions/mean_terminated_length": 269.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1250614272430539, + "epoch": 0.64, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5148164629936218, + "learning_rate": 7.109682030555282e-07, + "loss": 0.0044, + "num_tokens": 2802121.0, + "reward": 6.539069175720215, + "reward_std": 1.664305329322815, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": -0.029780671000480652, + "rewards/kidney_reward/std": 1.4029945135116577, + "rewards/length2tails_reward/mean": 0.7379071712493896, + "rewards/length2tails_reward/std": 0.3075510859489441, + "rewards/thermo_reward/mean": 0.17430394887924194, + "rewards/thermo_reward/std": 1.9058327674865723, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.1713531417772174, + "epoch": 0.642, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.541980028152466, + "learning_rate": 7.042919499559536e-07, + "loss": 0.0256, + "num_tokens": 2810877.0, + "reward": 6.0339860916137695, + "reward_std": 2.8362503051757812, + "rewards/fitness_reward/mean": 5.7353620529174805, + "rewards/fitness_reward/std": 2.6534714698791504, + "rewards/kidney_reward/mean": -0.24237263202667236, + "rewards/kidney_reward/std": 1.372509479522705, + "rewards/length2tails_reward/mean": 0.8674904704093933, + "rewards/length2tails_reward/std": 0.18604622781276703, + "rewards/thermo_reward/mean": 0.40587544441223145, + "rewards/thermo_reward/std": 1.596633791923523, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 281.9375, + "completions/mean_terminated_length": 281.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1418048208579421, + "epoch": 0.644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.55742609500885, + "learning_rate": 6.976301092495556e-07, + "loss": 0.025, + "num_tokens": 2819931.0, + "reward": 6.955789566040039, + "reward_std": 1.3362483978271484, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.1048911064863205, + "rewards/kidney_reward/std": 1.2173396348953247, + "rewards/length2tails_reward/mean": 0.7285305261611938, + "rewards/length2tails_reward/std": 0.3037022650241852, + "rewards/thermo_reward/mean": 0.25982531905174255, + "rewards/thermo_reward/std": 2.0253074169158936, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 267.40625, + "completions/mean_terminated_length": 267.40625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.1485171476379037, + "epoch": 0.646, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6387808322906494, + "learning_rate": 6.909830056250526e-07, + "loss": -0.0624, + "num_tokens": 2828520.0, + "reward": 5.970319747924805, + "reward_std": 2.572580099105835, + "rewards/fitness_reward/mean": 5.823781967163086, + "rewards/fitness_reward/std": 2.2292187213897705, + "rewards/kidney_reward/mean": 0.08642309904098511, + "rewards/kidney_reward/std": 1.3586586713790894, + "rewards/length2tails_reward/mean": 0.8334128856658936, + "rewards/length2tails_reward/std": 0.27404242753982544, + "rewards/thermo_reward/mean": -0.21005354821681976, + "rewards/thermo_reward/std": 2.043229579925537, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.9375, + "completions/mean_terminated_length": 270.9375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1293806629255414, + "epoch": 0.648, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7523674368858337, + "learning_rate": 6.843509630528976e-07, + "loss": 0.0026, + "num_tokens": 2837222.0, + "reward": 6.185390472412109, + "reward_std": 2.2776901721954346, + "rewards/fitness_reward/mean": 6.226982116699219, + "rewards/fitness_reward/std": 2.060880661010742, + "rewards/kidney_reward/mean": -0.24002555012702942, + "rewards/kidney_reward/std": 1.4971431493759155, + "rewards/length2tails_reward/mean": 0.8265526294708252, + "rewards/length2tails_reward/std": 0.18807940185070038, + "rewards/thermo_reward/mean": -0.2564352750778198, + "rewards/thermo_reward/std": 2.1351709365844727, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.71875, + "completions/mean_terminated_length": 265.71875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.14506491273641586, + "epoch": 0.65, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8249183297157288, + "learning_rate": 6.77734304769489e-07, + "loss": -0.0628, + "num_tokens": 2845757.0, + "reward": 5.609094619750977, + "reward_std": 3.602327346801758, + "rewards/fitness_reward/mean": 5.562655448913574, + "rewards/fitness_reward/std": 3.26778507232666, + "rewards/kidney_reward/mean": -0.17317365109920502, + "rewards/kidney_reward/std": 1.4439454078674316, + "rewards/length2tails_reward/mean": 0.8142207860946655, + "rewards/length2tails_reward/std": 0.2130184769630432, + "rewards/thermo_reward/mean": -0.14105884730815887, + "rewards/thermo_reward/std": 2.0546224117279053, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12599319033324718, + "epoch": 0.652, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3221113383769989, + "learning_rate": 6.711333532614167e-07, + "loss": -0.0001, + "num_tokens": 2854436.0, + "reward": 6.323266983032227, + "reward_std": 2.3031466007232666, + "rewards/fitness_reward/mean": 6.206177711486816, + "rewards/fitness_reward/std": 1.681239366531372, + "rewards/kidney_reward/mean": -0.11081783473491669, + "rewards/kidney_reward/std": 1.1887176036834717, + "rewards/length2tails_reward/mean": 0.8017334938049316, + "rewards/length2tails_reward/std": 0.22630544006824493, + "rewards/thermo_reward/mean": -0.05587046593427658, + "rewards/thermo_reward/std": 1.9467616081237793, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 273.4375, + "completions/mean_terminated_length": 273.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14385926723480225, + "epoch": 0.654, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.863189458847046, + "learning_rate": 6.645484302497451e-07, + "loss": 0.0393, + "num_tokens": 2863218.0, + "reward": 5.9234395027160645, + "reward_std": 2.1091227531433105, + "rewards/fitness_reward/mean": 6.135817527770996, + "rewards/fitness_reward/std": 2.0592422485351562, + "rewards/kidney_reward/mean": -0.16652746498584747, + "rewards/kidney_reward/std": 1.1908307075500488, + "rewards/length2tails_reward/mean": 0.8032562732696533, + "rewards/length2tails_reward/std": 0.25795942544937134, + "rewards/thermo_reward/mean": -0.6598567962646484, + "rewards/thermo_reward/std": 2.0198681354522705, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 275.4375, + "completions/mean_terminated_length": 275.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14171724021434784, + "epoch": 0.656, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0005085468292236, + "learning_rate": 6.579798566743313e-07, + "loss": 0.0739, + "num_tokens": 2872064.0, + "reward": 6.158988952636719, + "reward_std": 2.413141965866089, + "rewards/fitness_reward/mean": 6.1257853507995605, + "rewards/fitness_reward/std": 2.113737106323242, + "rewards/kidney_reward/mean": -0.23006314039230347, + "rewards/kidney_reward/std": 1.3204492330551147, + "rewards/length2tails_reward/mean": 0.7789495587348938, + "rewards/length2tails_reward/std": 0.28807762265205383, + "rewards/thermo_reward/mean": -0.09300464391708374, + "rewards/thermo_reward/std": 1.988651156425476, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1332209836691618, + "epoch": 0.658, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4491146206855774, + "learning_rate": 6.51427952678185e-07, + "loss": 0.0021, + "num_tokens": 2880764.0, + "reward": 6.950940132141113, + "reward_std": 1.3148001432418823, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.06448443233966827, + "rewards/kidney_reward/std": 1.3716617822647095, + "rewards/length2tails_reward/mean": 0.7650790214538574, + "rewards/length2tails_reward/std": 0.2712799906730652, + "rewards/thermo_reward/mean": 0.40122920274734497, + "rewards/thermo_reward/std": 1.916085958480835, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 278.5, + "completions/mean_terminated_length": 278.5, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.19188074674457312, + "epoch": 0.66, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4808740615844727, + "learning_rate": 6.448930375918631e-07, + "loss": 0.0101, + "num_tokens": 2889708.0, + "reward": 6.363583564758301, + "reward_std": 3.193633556365967, + "rewards/fitness_reward/mean": 5.766847133636475, + "rewards/fitness_reward/std": 2.871739625930786, + "rewards/kidney_reward/mean": 0.1560065597295761, + "rewards/kidney_reward/std": 1.4915741682052612, + "rewards/length2tails_reward/mean": 0.7530208826065063, + "rewards/length2tails_reward/std": 0.30450528860092163, + "rewards/thermo_reward/mean": 0.6609548926353455, + "rewards/thermo_reward/std": 1.5844378471374512, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1405778331682086, + "epoch": 0.662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5762265920639038, + "learning_rate": 6.383754299179078e-07, + "loss": -0.0001, + "num_tokens": 2898380.0, + "reward": 5.537961483001709, + "reward_std": 3.1746127605438232, + "rewards/fitness_reward/mean": 5.317624568939209, + "rewards/fitness_reward/std": 3.140000104904175, + "rewards/kidney_reward/mean": 0.0035296976566314697, + "rewards/kidney_reward/std": 1.4678502082824707, + "rewards/length2tails_reward/mean": 0.7269901037216187, + "rewards/length2tails_reward/std": 0.33388474583625793, + "rewards/thermo_reward/mean": 0.07364928722381592, + "rewards/thermo_reward/std": 2.0159270763397217, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.0625, + "completions/mean_terminated_length": 270.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.123139763250947, + "epoch": 0.664, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4227535724639893, + "learning_rate": 6.31875447315322e-07, + "loss": -0.004, + "num_tokens": 2907054.0, + "reward": 6.01489782333374, + "reward_std": 2.012277603149414, + "rewards/fitness_reward/mean": 6.004064559936523, + "rewards/fitness_reward/std": 1.8035767078399658, + "rewards/kidney_reward/mean": -0.061723992228507996, + "rewards/kidney_reward/std": 1.3236358165740967, + "rewards/length2tails_reward/mean": 0.7410252690315247, + "rewards/length2tails_reward/std": 0.33866503834724426, + "rewards/thermo_reward/mean": -0.28712227940559387, + "rewards/thermo_reward/std": 1.983262062072754, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 275.40625, + "completions/mean_terminated_length": 275.40625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14197211991995573, + "epoch": 0.666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8882272243499756, + "learning_rate": 6.253934065840879e-07, + "loss": 0.0632, + "num_tokens": 2915899.0, + "reward": 5.8689494132995605, + "reward_std": 2.980410575866699, + "rewards/fitness_reward/mean": 5.711335182189941, + "rewards/fitness_reward/std": 2.7298057079315186, + "rewards/kidney_reward/mean": -0.27478981018066406, + "rewards/kidney_reward/std": 1.34125816822052, + "rewards/length2tails_reward/mean": 0.7590400576591492, + "rewards/length2tails_reward/std": 0.3217729926109314, + "rewards/thermo_reward/mean": 0.21049821376800537, + "rewards/thermo_reward/std": 1.6486570835113525, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.16120561491698027, + "epoch": 0.668, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4252558946609497, + "learning_rate": 6.189296236497259e-07, + "loss": 0.0085, + "num_tokens": 2924676.0, + "reward": 6.017755508422852, + "reward_std": 2.7611100673675537, + "rewards/fitness_reward/mean": 5.929872512817383, + "rewards/fitness_reward/std": 2.1686391830444336, + "rewards/kidney_reward/mean": -0.19728940725326538, + "rewards/kidney_reward/std": 1.508063793182373, + "rewards/length2tails_reward/mean": 0.9154133200645447, + "rewards/length2tails_reward/std": 0.13163615763187408, + "rewards/thermo_reward/mean": -0.08465149998664856, + "rewards/thermo_reward/std": 2.1873550415039062, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.90625, + "completions/mean_terminated_length": 271.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13629860151559114, + "epoch": 0.67, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6146358847618103, + "learning_rate": 6.124844135478971e-07, + "loss": -0.0018, + "num_tokens": 2933409.0, + "reward": 6.3639020919799805, + "reward_std": 1.8899461030960083, + "rewards/fitness_reward/mean": 6.188002586364746, + "rewards/fitness_reward/std": 1.7780461311340332, + "rewards/kidney_reward/mean": 0.26472312211990356, + "rewards/kidney_reward/std": 1.3054981231689453, + "rewards/length2tails_reward/mean": 0.8750399351119995, + "rewards/length2tails_reward/std": 0.20164364576339722, + "rewards/thermo_reward/mean": -0.3504437208175659, + "rewards/thermo_reward/std": 2.15000581741333, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1350244265049696, + "epoch": 0.672, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46959754824638367, + "learning_rate": 6.060580904090489e-07, + "loss": -0.0032, + "num_tokens": 2942130.0, + "reward": 6.7464599609375, + "reward_std": 1.4381029605865479, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.09605660289525986, + "rewards/kidney_reward/std": 1.3236219882965088, + "rewards/length2tails_reward/mean": 0.8153715133666992, + "rewards/length2tails_reward/std": 0.2927325665950775, + "rewards/thermo_reward/mean": 0.012558825314044952, + "rewards/thermo_reward/std": 1.980684757232666, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 278.84375, + "completions/mean_terminated_length": 278.84375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.1624396527186036, + "epoch": 0.674, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9301204681396484, + "learning_rate": 5.996509674431051e-07, + "loss": -0.0045, + "num_tokens": 2951085.0, + "reward": 6.039576530456543, + "reward_std": 2.4619998931884766, + "rewards/fitness_reward/mean": 6.033344268798828, + "rewards/fitness_reward/std": 2.1146256923675537, + "rewards/kidney_reward/mean": -0.047126639634370804, + "rewards/kidney_reward/std": 1.3129688501358032, + "rewards/length2tails_reward/mean": 0.8374546766281128, + "rewards/length2tails_reward/std": 0.27305513620376587, + "rewards/thermo_reward/mean": -0.35913562774658203, + "rewards/thermo_reward/std": 2.1510767936706543, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 270.90625, + "completions/mean_terminated_length": 270.90625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12986998166888952, + "epoch": 0.676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7267351150512695, + "learning_rate": 5.932633569241999e-07, + "loss": -0.004, + "num_tokens": 2959786.0, + "reward": 6.215915679931641, + "reward_std": 1.7919178009033203, + "rewards/fitness_reward/mean": 6.293497085571289, + "rewards/fitness_reward/std": 1.6846182346343994, + "rewards/kidney_reward/mean": -0.48145976662635803, + "rewards/kidney_reward/std": 1.4974082708358765, + "rewards/length2tails_reward/mean": 0.7296417951583862, + "rewards/length2tails_reward/std": 0.3265739381313324, + "rewards/thermo_reward/mean": -0.03852371871471405, + "rewards/thermo_reward/std": 1.8306382894515991, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 277.6875, + "completions/mean_terminated_length": 277.6875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14375455770641565, + "epoch": 0.678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5397517681121826, + "learning_rate": 5.868955701754583e-07, + "loss": 0.0027, + "num_tokens": 2968704.0, + "reward": 6.217046737670898, + "reward_std": 3.013193130493164, + "rewards/fitness_reward/mean": 5.526139259338379, + "rewards/fitness_reward/std": 3.077094078063965, + "rewards/kidney_reward/mean": 0.12864208221435547, + "rewards/kidney_reward/std": 1.2796517610549927, + "rewards/length2tails_reward/mean": 0.8049825429916382, + "rewards/length2tails_reward/std": 0.2577892541885376, + "rewards/thermo_reward/mean": 0.8506805896759033, + "rewards/thermo_reward/std": 1.3328129053115845, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 282.25, + "completions/mean_terminated_length": 282.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14949253015220165, + "epoch": 0.68, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.905804395675659, + "learning_rate": 5.805479175538228e-07, + "loss": 0.1501, + "num_tokens": 2977768.0, + "reward": 6.654813766479492, + "reward_std": 2.559910774230957, + "rewards/fitness_reward/mean": 5.95985221862793, + "rewards/fitness_reward/std": 2.4967284202575684, + "rewards/kidney_reward/mean": 0.1843625009059906, + "rewards/kidney_reward/std": 1.3213741779327393, + "rewards/length2tails_reward/mean": 0.7851734161376953, + "rewards/length2tails_reward/std": 0.31144633889198303, + "rewards/thermo_reward/mean": 0.8129734992980957, + "rewards/thermo_reward/std": 1.5214051008224487, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 276.46875, + "completions/mean_terminated_length": 276.46875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1573938773944974, + "epoch": 0.682, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2604691982269287, + "learning_rate": 5.742207084349273e-07, + "loss": 0.0695, + "num_tokens": 2986647.0, + "reward": 6.545932769775391, + "reward_std": 2.2556028366088867, + "rewards/fitness_reward/mean": 6.107923984527588, + "rewards/fitness_reward/std": 2.2110416889190674, + "rewards/kidney_reward/mean": -0.17903482913970947, + "rewards/kidney_reward/std": 1.1938183307647705, + "rewards/length2tails_reward/mean": 0.9086741805076599, + "rewards/length2tails_reward/std": 0.1183965727686882, + "rewards/thermo_reward/mean": 0.6007147431373596, + "rewards/thermo_reward/std": 1.7732858657836914, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 282.0625, + "completions/mean_terminated_length": 282.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.16670695785433054, + "epoch": 0.684, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8591322898864746, + "learning_rate": 5.679142511980175e-07, + "loss": 0.1724, + "num_tokens": 2995705.0, + "reward": 6.364426612854004, + "reward_std": 2.4766600131988525, + "rewards/fitness_reward/mean": 6.218327522277832, + "rewards/fitness_reward/std": 2.109841823577881, + "rewards/kidney_reward/mean": -0.07648028433322906, + "rewards/kidney_reward/std": 1.3622918128967285, + "rewards/length2tails_reward/mean": 0.8389837741851807, + "rewards/length2tails_reward/std": 0.25265058875083923, + "rewards/thermo_reward/mean": -0.05081367492675781, + "rewards/thermo_reward/std": 2.0749711990356445, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 290.4375, + "completions/mean_terminated_length": 290.4375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.16736070439219475, + "epoch": 0.686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8917258381843567, + "learning_rate": 5.616288532109224e-07, + "loss": 0.022, + "num_tokens": 3005031.0, + "reward": 5.868127346038818, + "reward_std": 2.222395420074463, + "rewards/fitness_reward/mean": 5.939970970153809, + "rewards/fitness_reward/std": 2.117759943008423, + "rewards/kidney_reward/mean": -0.30231744050979614, + "rewards/kidney_reward/std": 1.301476240158081, + "rewards/length2tails_reward/mean": 0.8077414035797119, + "rewards/length2tails_reward/std": 0.26545435190200806, + "rewards/thermo_reward/mean": -0.24524012207984924, + "rewards/thermo_reward/std": 2.0206568241119385, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 266.65625, + "completions/mean_terminated_length": 266.65625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.13543427735567093, + "epoch": 0.688, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4829248189926147, + "learning_rate": 5.553648208150728e-07, + "loss": -0.0313, + "num_tokens": 3013596.0, + "reward": 5.708004951477051, + "reward_std": 3.3088643550872803, + "rewards/fitness_reward/mean": 5.3620171546936035, + "rewards/fitness_reward/std": 3.26958966255188, + "rewards/kidney_reward/mean": 0.10526783764362335, + "rewards/kidney_reward/std": 1.2809340953826904, + "rewards/length2tails_reward/mean": 0.6367802023887634, + "rewards/length2tails_reward/std": 0.39182284474372864, + "rewards/thermo_reward/mean": 0.2683173418045044, + "rewards/thermo_reward/std": 1.797824740409851, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 274.96875, + "completions/mean_terminated_length": 274.96875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.14805607683956623, + "epoch": 0.69, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5612566471099854, + "learning_rate": 5.491224593105694e-07, + "loss": -0.0167, + "num_tokens": 3022427.0, + "reward": 6.30277156829834, + "reward_std": 2.0895543098449707, + "rewards/fitness_reward/mean": 6.204276084899902, + "rewards/fitness_reward/std": 1.6913362741470337, + "rewards/kidney_reward/mean": -0.19850590825080872, + "rewards/kidney_reward/std": 1.399698257446289, + "rewards/length2tails_reward/mean": 0.760990560054779, + "rewards/length2tails_reward/std": 0.2998768985271454, + "rewards/thermo_reward/mean": 0.015001252293586731, + "rewards/thermo_reward/std": 2.113967180252075, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15043269284069538, + "epoch": 0.692, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.009800672531128, + "learning_rate": 5.42902072941306e-07, + "loss": 0.0068, + "num_tokens": 3031164.0, + "reward": 6.181963920593262, + "reward_std": 2.685715675354004, + "rewards/fitness_reward/mean": 5.919095039367676, + "rewards/fitness_reward/std": 2.649914264678955, + "rewards/kidney_reward/mean": -0.24342183768749237, + "rewards/kidney_reward/std": 1.2877310514450073, + "rewards/length2tails_reward/mean": 0.8092514872550964, + "rewards/length2tails_reward/std": 0.2687968909740448, + "rewards/thermo_reward/mean": 0.36453354358673096, + "rewards/thermo_reward/std": 1.8220114707946777, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1562238810583949, + "epoch": 0.694, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.349087715148926, + "learning_rate": 5.367039648801385e-07, + "loss": 0.0074, + "num_tokens": 3039895.0, + "reward": 5.90168571472168, + "reward_std": 2.823669195175171, + "rewards/fitness_reward/mean": 5.8231048583984375, + "rewards/fitness_reward/std": 2.693870782852173, + "rewards/kidney_reward/mean": -0.2964228391647339, + "rewards/kidney_reward/std": 1.2563472986221313, + "rewards/length2tails_reward/mean": 0.7685627937316895, + "rewards/length2tails_reward/std": 0.33195197582244873, + "rewards/thermo_reward/mean": 0.06930221617221832, + "rewards/thermo_reward/std": 2.0728096961975098, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.14733695331960917, + "epoch": 0.696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.797904372215271, + "learning_rate": 5.305284372141095e-07, + "loss": -0.0051, + "num_tokens": 3048551.0, + "reward": 6.2037858963012695, + "reward_std": 2.359827756881714, + "rewards/fitness_reward/mean": 6.003262996673584, + "rewards/fitness_reward/std": 2.3144755363464355, + "rewards/kidney_reward/mean": 0.26831483840942383, + "rewards/kidney_reward/std": 1.3605223894119263, + "rewards/length2tails_reward/mean": 0.7673937082290649, + "rewards/length2tails_reward/std": 0.29162222146987915, + "rewards/thermo_reward/mean": -0.2509657144546509, + "rewards/thermo_reward/std": 1.9805631637573242, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 280.0625, + "completions/mean_terminated_length": 280.0625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15698255505412817, + "epoch": 0.698, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9059972763061523, + "learning_rate": 5.243757909297246e-07, + "loss": 0.1457, + "num_tokens": 3057545.0, + "reward": 6.684830665588379, + "reward_std": 2.255929470062256, + "rewards/fitness_reward/mean": 6.252006530761719, + "rewards/fitness_reward/std": 1.9193217754364014, + "rewards/kidney_reward/mean": 0.38714808225631714, + "rewards/kidney_reward/std": 1.3580050468444824, + "rewards/length2tails_reward/mean": 0.7600257396697998, + "rewards/length2tails_reward/std": 0.33787864446640015, + "rewards/thermo_reward/mean": 0.0984865352511406, + "rewards/thermo_reward/std": 2.025115489959717, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1372505398467183, + "epoch": 0.7, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41981613636016846, + "learning_rate": 5.182463258982846e-07, + "loss": 0.0016, + "num_tokens": 3066232.0, + "reward": 6.245865821838379, + "reward_std": 1.3084803819656372, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.4447236657142639, + "rewards/kidney_reward/std": 1.3307983875274658, + "rewards/length2tails_reward/mean": 0.7939730286598206, + "rewards/length2tails_reward/std": 0.2816673517227173, + "rewards/thermo_reward/mean": -0.2311713695526123, + "rewards/thermo_reward/std": 1.8473246097564697, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 272.84375, + "completions/mean_terminated_length": 272.84375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.177174954675138, + "epoch": 0.702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8986427783966064, + "learning_rate": 5.121403408612671e-07, + "loss": -0.0039, + "num_tokens": 3074995.0, + "reward": 5.444323539733887, + "reward_std": 3.1555228233337402, + "rewards/fitness_reward/mean": 5.578423976898193, + "rewards/fitness_reward/std": 3.2130587100982666, + "rewards/kidney_reward/mean": -0.5759067535400391, + "rewards/kidney_reward/std": 1.0323132276535034, + "rewards/length2tails_reward/mean": 0.8635549545288086, + "rewards/length2tails_reward/std": 0.2309083193540573, + "rewards/thermo_reward/mean": -0.12407190352678299, + "rewards/thermo_reward/std": 2.0656275749206543, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 279.03125, + "completions/mean_terminated_length": 279.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1382511956617236, + "epoch": 0.704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28194135427474976, + "learning_rate": 5.060581334157692e-07, + "loss": 0.0046, + "num_tokens": 3083956.0, + "reward": 6.333901882171631, + "reward_std": 1.6132252216339111, + "rewards/fitness_reward/mean": 6.282331466674805, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": -0.1382146179676056, + "rewards/kidney_reward/std": 1.2211270332336426, + "rewards/length2tails_reward/mean": 0.7396640777587891, + "rewards/length2tails_reward/std": 0.34703871607780457, + "rewards/thermo_reward/mean": -0.12847588956356049, + "rewards/thermo_reward/std": 1.7511825561523438, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 266.625, + "completions/mean_terminated_length": 266.625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.17926898691803217, + "epoch": 0.706, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.676492214202881, + "learning_rate": 5.000000000000002e-07, + "loss": -0.0297, + "num_tokens": 3092520.0, + "reward": 6.2205400466918945, + "reward_std": 3.0040314197540283, + "rewards/fitness_reward/mean": 5.88077449798584, + "rewards/fitness_reward/std": 2.7958946228027344, + "rewards/kidney_reward/mean": 0.011723548173904419, + "rewards/kidney_reward/std": 1.3006480932235718, + "rewards/length2tails_reward/mean": 0.6432532668113708, + "rewards/length2tails_reward/std": 0.3428514003753662, + "rewards/thermo_reward/mean": 0.34617942571640015, + "rewards/thermo_reward/std": 1.814773678779602, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.21875, + "completions/mean_terminated_length": 270.21875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13824842125177383, + "epoch": 0.708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6739991903305054, + "learning_rate": 4.939662358788364e-07, + "loss": 0.0083, + "num_tokens": 3101199.0, + "reward": 7.00583553314209, + "reward_std": 1.7048717737197876, + "rewards/fitness_reward/mean": 6.30168342590332, + "rewards/fitness_reward/std": 1.6383068561553955, + "rewards/kidney_reward/mean": -0.24639709293842316, + "rewards/kidney_reward/std": 1.4407916069030762, + "rewards/length2tails_reward/mean": 0.7735517621040344, + "rewards/length2tails_reward/std": 0.20475666224956512, + "rewards/thermo_reward/mean": 1.2679250240325928, + "rewards/thermo_reward/std": 1.1704320907592773, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.5625, + "completions/mean_terminated_length": 271.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1290579428896308, + "epoch": 0.71, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.526790976524353, + "learning_rate": 4.879571351294286e-07, + "loss": -0.001, + "num_tokens": 3109921.0, + "reward": 6.389036655426025, + "reward_std": 1.4514721632003784, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.377413809299469, + "rewards/kidney_reward/std": 1.091863989830017, + "rewards/length2tails_reward/mean": 0.8268851041793823, + "rewards/length2tails_reward/std": 0.26292723417282104, + "rewards/thermo_reward/mean": -0.23457345366477966, + "rewards/thermo_reward/std": 2.134856700897217, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.13011477049440145, + "epoch": 0.712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.507643461227417, + "learning_rate": 4.819729906268699e-07, + "loss": -0.0111, + "num_tokens": 3118587.0, + "reward": 6.725968837738037, + "reward_std": 2.484793186187744, + "rewards/fitness_reward/mean": 6.2299323081970215, + "rewards/fitness_reward/std": 2.0441930294036865, + "rewards/kidney_reward/mean": 0.29642677307128906, + "rewards/kidney_reward/std": 1.4404501914978027, + "rewards/length2tails_reward/mean": 0.8288535475730896, + "rewards/length2tails_reward/std": 0.2590929865837097, + "rewards/thermo_reward/mean": 0.2812193036079407, + "rewards/thermo_reward/std": 1.9625108242034912, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 273.40625, + "completions/mean_terminated_length": 273.40625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.1574456738308072, + "epoch": 0.714, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.192049503326416, + "learning_rate": 4.76014094029921e-07, + "loss": 0.0221, + "num_tokens": 3127368.0, + "reward": 5.9911274909973145, + "reward_std": 3.1412923336029053, + "rewards/fitness_reward/mean": 5.862482070922852, + "rewards/fitness_reward/std": 2.868088960647583, + "rewards/kidney_reward/mean": -0.2520975172519684, + "rewards/kidney_reward/std": 1.3299691677093506, + "rewards/length2tails_reward/mean": 0.8562111258506775, + "rewards/length2tails_reward/std": 0.2258239984512329, + "rewards/thermo_reward/mean": 0.08128249645233154, + "rewards/thermo_reward/std": 1.9431575536727905, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 277.53125, + "completions/mean_terminated_length": 277.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1446541864424944, + "epoch": 0.716, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.883546829223633, + "learning_rate": 4.700807357667952e-07, + "loss": 0.0751, + "num_tokens": 3136281.0, + "reward": 6.001834869384766, + "reward_std": 2.6639564037323, + "rewards/fitness_reward/mean": 5.9749755859375, + "rewards/fitness_reward/std": 2.4309489727020264, + "rewards/kidney_reward/mean": -0.41944435238838196, + "rewards/kidney_reward/std": 1.414008378982544, + "rewards/length2tails_reward/mean": 0.8256665468215942, + "rewards/length2tails_reward/std": 0.2446584701538086, + "rewards/thermo_reward/mean": 0.06032890826463699, + "rewards/thermo_reward/std": 1.9380780458450317, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.1585037438198924, + "epoch": 0.718, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5597310066223145, + "learning_rate": 4.641732050210031e-07, + "loss": -0.0062, + "num_tokens": 3144952.0, + "reward": 5.679251670837402, + "reward_std": 3.744248151779175, + "rewards/fitness_reward/mean": 5.753543853759766, + "rewards/fitness_reward/std": 2.92307186126709, + "rewards/kidney_reward/mean": 0.015622451901435852, + "rewards/kidney_reward/std": 1.5301791429519653, + "rewards/length2tails_reward/mean": 0.7561187148094177, + "rewards/length2tails_reward/std": 0.3244445323944092, + "rewards/thermo_reward/mean": -0.5422676205635071, + "rewards/thermo_reward/std": 2.146629810333252, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14853220619261265, + "epoch": 0.72, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22003161907196045, + "learning_rate": 4.5829178971726023e-07, + "loss": 0.0001, + "num_tokens": 3153644.0, + "reward": 6.998676300048828, + "reward_std": 1.162030816078186, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.1410554051399231, + "rewards/kidney_reward/std": 1.3983737230300903, + "rewards/length2tails_reward/mean": 0.8080669641494751, + "rewards/length2tails_reward/std": 0.23570886254310608, + "rewards/thermo_reward/mean": 0.2696669101715088, + "rewards/thermo_reward/std": 1.6834814548492432, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.12786609400063753, + "epoch": 0.722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.924838662147522, + "learning_rate": 4.524367765074498e-07, + "loss": -0.0082, + "num_tokens": 3162341.0, + "reward": 5.59239387512207, + "reward_std": 3.2712528705596924, + "rewards/fitness_reward/mean": 5.735169410705566, + "rewards/fitness_reward/std": 2.9949986934661865, + "rewards/kidney_reward/mean": -0.12405352294445038, + "rewards/kidney_reward/std": 1.2700321674346924, + "rewards/length2tails_reward/mean": 0.8160407543182373, + "rewards/length2tails_reward/std": 0.25917312502861023, + "rewards/thermo_reward/mean": -0.5695180892944336, + "rewards/thermo_reward/std": 1.9807058572769165, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1366393668577075, + "epoch": 0.724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4617418348789215, + "learning_rate": 4.46608450756656e-07, + "loss": 0.0093, + "num_tokens": 3171036.0, + "reward": 6.625120162963867, + "reward_std": 1.5828049182891846, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.11791878938674927, + "rewards/kidney_reward/std": 1.4317349195480347, + "rewards/length2tails_reward/mean": 0.8119157552719116, + "rewards/length2tails_reward/std": 0.22039470076560974, + "rewards/thermo_reward/mean": -0.014416981488466263, + "rewards/thermo_reward/std": 1.7909175157546997, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 262.875, + "completions/mean_terminated_length": 262.875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.13373654335737228, + "epoch": 0.726, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0905619859695435, + "learning_rate": 4.408070965292533e-07, + "loss": -0.0696, + "num_tokens": 3179480.0, + "reward": 5.402747631072998, + "reward_std": 3.670199155807495, + "rewards/fitness_reward/mean": 5.362958908081055, + "rewards/fitness_reward/std": 3.5691020488739014, + "rewards/kidney_reward/mean": -0.31657668948173523, + "rewards/kidney_reward/std": 1.1466575860977173, + "rewards/length2tails_reward/mean": 0.8078165054321289, + "rewards/length2tails_reward/std": 0.234373077750206, + "rewards/thermo_reward/mean": -0.007755070924758911, + "rewards/thermo_reward/std": 1.9276933670043945, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 269.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11955827847123146, + "epoch": 0.728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9351580142974854, + "learning_rate": 4.350329965750621e-07, + "loss": -0.0021, + "num_tokens": 3188146.0, + "reward": 6.064743995666504, + "reward_std": 1.4324169158935547, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.6712437868118286, + "rewards/kidney_reward/std": 1.1112077236175537, + "rewards/length2tails_reward/mean": 0.7376230359077454, + "rewards/length2tails_reward/std": 0.32265642285346985, + "rewards/thermo_reward/mean": -0.3387202024459839, + "rewards/thermo_reward/std": 2.0547823905944824, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 274.5625, + "completions/mean_terminated_length": 274.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14470462873578072, + "epoch": 0.73, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8790856003761292, + "learning_rate": 4.292864323155684e-07, + "loss": 0.0258, + "num_tokens": 3196964.0, + "reward": 6.670239448547363, + "reward_std": 1.5476105213165283, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.13486161828041077, + "rewards/kidney_reward/std": 1.4245083332061768, + "rewards/length2tails_reward/mean": 0.7783125638961792, + "rewards/length2tails_reward/std": 0.2922722101211548, + "rewards/thermo_reward/mean": 0.10956567525863647, + "rewards/thermo_reward/std": 2.0721426010131836, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 276.5625, + "completions/mean_terminated_length": 276.5625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1494081998243928, + "epoch": 0.732, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.519752025604248, + "learning_rate": 4.235676838302068e-07, + "loss": 0.1007, + "num_tokens": 3205846.0, + "reward": 6.206664562225342, + "reward_std": 2.2861545085906982, + "rewards/fitness_reward/mean": 6.025056838989258, + "rewards/fitness_reward/std": 2.1580092906951904, + "rewards/kidney_reward/mean": -0.3018892705440521, + "rewards/kidney_reward/std": 1.277760624885559, + "rewards/length2tails_reward/mean": 0.7835294008255005, + "rewards/length2tails_reward/std": 0.27964353561401367, + "rewards/thermo_reward/mean": 0.2733404040336609, + "rewards/thermo_reward/std": 1.9387136697769165, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.12632002774626017, + "epoch": 0.734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7130632400512695, + "learning_rate": 4.1787702984271065e-07, + "loss": -0.0029, + "num_tokens": 3214486.0, + "reward": 6.317443370819092, + "reward_std": 2.8365418910980225, + "rewards/fitness_reward/mean": 5.861456871032715, + "rewards/fitness_reward/std": 2.524892807006836, + "rewards/kidney_reward/mean": 0.6378341913223267, + "rewards/kidney_reward/std": 1.403192400932312, + "rewards/length2tails_reward/mean": 0.7120263576507568, + "rewards/length2tails_reward/std": 0.33472928404808044, + "rewards/thermo_reward/mean": -0.08187372982501984, + "rewards/thermo_reward/std": 1.9261077642440796, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.11630972567945719, + "epoch": 0.736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0709792375564575, + "learning_rate": 4.1221474770752696e-07, + "loss": 0.0438, + "num_tokens": 3223250.0, + "reward": 6.468087196350098, + "reward_std": 2.417856454849243, + "rewards/fitness_reward/mean": 6.030662536621094, + "rewards/fitness_reward/std": 2.1286466121673584, + "rewards/kidney_reward/mean": 0.2095390260219574, + "rewards/kidney_reward/std": 1.2892457246780396, + "rewards/length2tails_reward/mean": 0.7105699777603149, + "rewards/length2tails_reward/std": 0.3106299340724945, + "rewards/thermo_reward/mean": 0.3100256323814392, + "rewards/thermo_reward/std": 1.8363381624221802, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1587747959420085, + "epoch": 0.738, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5487451553344727, + "learning_rate": 4.0658111339629865e-07, + "loss": 0.0362, + "num_tokens": 3232035.0, + "reward": 6.189671516418457, + "reward_std": 2.8210692405700684, + "rewards/fitness_reward/mean": 5.8552961349487305, + "rewards/fitness_reward/std": 2.550931930541992, + "rewards/kidney_reward/mean": -0.002792835235595703, + "rewards/kidney_reward/std": 1.2694661617279053, + "rewards/length2tails_reward/mean": 0.7921147346496582, + "rewards/length2tails_reward/std": 0.24774512648582458, + "rewards/thermo_reward/mean": 0.27548542618751526, + "rewards/thermo_reward/std": 1.771033525466919, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 275.84375, + "completions/mean_terminated_length": 275.84375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.14940356370061636, + "epoch": 0.74, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5113012790679932, + "learning_rate": 4.0097640148441423e-07, + "loss": 0.0281, + "num_tokens": 3240894.0, + "reward": 5.955854415893555, + "reward_std": 3.596100091934204, + "rewards/fitness_reward/mean": 5.546163558959961, + "rewards/fitness_reward/std": 3.3078951835632324, + "rewards/kidney_reward/mean": 0.030694488435983658, + "rewards/kidney_reward/std": 1.2919914722442627, + "rewards/length2tails_reward/mean": 0.7410632371902466, + "rewards/length2tails_reward/std": 0.31063371896743774, + "rewards/thermo_reward/mean": 0.41815584897994995, + "rewards/thermo_reward/std": 1.7940014600753784, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 282.15625, + "completions/mean_terminated_length": 282.15625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.18381345830857754, + "epoch": 0.742, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0370407104492188, + "learning_rate": 3.9540088513762516e-07, + "loss": 0.103, + "num_tokens": 3249955.0, + "reward": 4.9476799964904785, + "reward_std": 3.550820827484131, + "rewards/fitness_reward/mean": 5.036832332611084, + "rewards/fitness_reward/std": 3.907076120376587, + "rewards/kidney_reward/mean": -0.40763381123542786, + "rewards/kidney_reward/std": 1.3715983629226685, + "rewards/length2tails_reward/mean": 0.8609472513198853, + "rewards/length2tails_reward/std": 0.2464415431022644, + "rewards/thermo_reward/mean": -0.20114412903785706, + "rewards/thermo_reward/std": 2.1375131607055664, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 269.53125, + "completions/mean_terminated_length": 269.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1285235472023487, + "epoch": 0.744, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.580424964427948, + "learning_rate": 3.8985483609873236e-07, + "loss": 0.0101, + "num_tokens": 3258612.0, + "reward": 6.584819316864014, + "reward_std": 1.7636280059814453, + "rewards/fitness_reward/mean": 6.2823309898376465, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.24670317769050598, + "rewards/kidney_reward/std": 1.4007341861724854, + "rewards/length2tails_reward/mean": 0.6969554424285889, + "rewards/length2tails_reward/std": 0.3184390068054199, + "rewards/thermo_reward/mean": 0.00979556143283844, + "rewards/thermo_reward/std": 2.098444700241089, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 269.25, + "completions/mean_terminated_length": 269.25, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.1606553541496396, + "epoch": 0.746, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34814190864563, + "learning_rate": 3.843385246743417e-07, + "loss": -0.006, + "num_tokens": 3267260.0, + "reward": 5.879519939422607, + "reward_std": 3.6545047760009766, + "rewards/fitness_reward/mean": 5.286998271942139, + "rewards/fitness_reward/std": 3.4983344078063965, + "rewards/kidney_reward/mean": 0.09259563684463501, + "rewards/kidney_reward/std": 1.2750647068023682, + "rewards/length2tails_reward/mean": 0.7489024996757507, + "rewards/length2tails_reward/std": 0.29031893610954285, + "rewards/thermo_reward/mean": 0.7179964780807495, + "rewards/thermo_reward/std": 1.3379138708114624, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13412442337721586, + "epoch": 0.748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4042009711265564, + "learning_rate": 3.788522197216897e-07, + "loss": -0.0003, + "num_tokens": 3275978.0, + "reward": 6.59251594543457, + "reward_std": 1.5096096992492676, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.1605461984872818, + "rewards/kidney_reward/std": 1.2568833827972412, + "rewards/length2tails_reward/mean": 0.8646030426025391, + "rewards/length2tails_reward/std": 0.2068766951560974, + "rewards/thermo_reward/mean": 0.14263558387756348, + "rewards/thermo_reward/std": 1.953226923942566, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 282.03125, + "completions/mean_terminated_length": 282.03125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17999477125704288, + "epoch": 0.75, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6332778930664062, + "learning_rate": 3.7339618863553976e-07, + "loss": 0.1251, + "num_tokens": 3285035.0, + "reward": 6.153130054473877, + "reward_std": 3.041475534439087, + "rewards/fitness_reward/mean": 5.8121442794799805, + "rewards/fitness_reward/std": 2.697699546813965, + "rewards/kidney_reward/mean": 0.07375882565975189, + "rewards/kidney_reward/std": 1.289566159248352, + "rewards/length2tails_reward/mean": 0.8197746276855469, + "rewards/length2tails_reward/std": 0.26130786538124084, + "rewards/thermo_reward/mean": 0.19832536578178406, + "rewards/thermo_reward/std": 1.8575512170791626, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13965389877557755, + "epoch": 0.752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7165160775184631, + "learning_rate": 3.679706973351491e-07, + "loss": 0.0033, + "num_tokens": 3293718.0, + "reward": 5.530478477478027, + "reward_std": 2.9073171615600586, + "rewards/fitness_reward/mean": 5.87794828414917, + "rewards/fitness_reward/std": 2.8070197105407715, + "rewards/kidney_reward/mean": -0.1837131381034851, + "rewards/kidney_reward/std": 1.370072603225708, + "rewards/length2tails_reward/mean": 0.7681834697723389, + "rewards/length2tails_reward/std": 0.33813896775245667, + "rewards/thermo_reward/mean": -0.8953179121017456, + "rewards/thermo_reward/std": 1.9587002992630005, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.8125, + "completions/mean_terminated_length": 270.8125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12181603256613016, + "epoch": 0.754, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47920435667037964, + "learning_rate": 3.625760102513102e-07, + "loss": -0.0005, + "num_tokens": 3302416.0, + "reward": 5.9532647132873535, + "reward_std": 2.242292642593384, + "rewards/fitness_reward/mean": 6.098564147949219, + "rewards/fitness_reward/std": 1.779414176940918, + "rewards/kidney_reward/mean": -0.08578507602214813, + "rewards/kidney_reward/std": 1.4096269607543945, + "rewards/length2tails_reward/mean": 0.7863004803657532, + "rewards/length2tails_reward/std": 0.3195054829120636, + "rewards/thermo_reward/mean": -0.5979645252227783, + "rewards/thermo_reward/std": 1.8952313661575317, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.15209331456571817, + "epoch": 0.756, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3993508815765381, + "learning_rate": 3.5721239031346063e-07, + "loss": 0.004, + "num_tokens": 3311200.0, + "reward": 6.885627746582031, + "reward_std": 1.4935102462768555, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.03521929681301117, + "rewards/kidney_reward/std": 1.4940382242202759, + "rewards/length2tails_reward/mean": 0.8114314079284668, + "rewards/length2tails_reward/std": 0.22135481238365173, + "rewards/thermo_reward/mean": 0.42414015531539917, + "rewards/thermo_reward/std": 1.7999176979064941, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.53125, + "completions/mean_terminated_length": 269.53125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.11956506315618753, + "epoch": 0.758, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5052749514579773, + "learning_rate": 3.518800989368691e-07, + "loss": -0.0054, + "num_tokens": 3319857.0, + "reward": 6.8014068603515625, + "reward_std": 1.4542784690856934, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.09797079861164093, + "rewards/kidney_reward/std": 1.2850091457366943, + "rewards/length2tails_reward/mean": 0.735588014125824, + "rewards/length2tails_reward/std": 0.2957015931606293, + "rewards/thermo_reward/mean": 0.356372594833374, + "rewards/thermo_reward/std": 1.8807733058929443, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 263.03125, + "completions/mean_terminated_length": 263.03125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.13626226875931025, + "epoch": 0.76, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4649805426597595, + "learning_rate": 3.465793960098945e-07, + "loss": -0.1314, + "num_tokens": 3328306.0, + "reward": 6.379859924316406, + "reward_std": 2.2455568313598633, + "rewards/fitness_reward/mean": 6.234853744506836, + "rewards/fitness_reward/std": 2.0163543224334717, + "rewards/kidney_reward/mean": -0.25092077255249023, + "rewards/kidney_reward/std": 1.4039207696914673, + "rewards/length2tails_reward/mean": 0.8077023029327393, + "rewards/length2tails_reward/std": 0.21923108398914337, + "rewards/thermo_reward/mean": 0.13708209991455078, + "rewards/thermo_reward/std": 1.9737155437469482, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 286.875, + "completions/mean_terminated_length": 286.875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.15023626573383808, + "epoch": 0.762, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.262868881225586, + "learning_rate": 3.4131053988131944e-07, + "loss": 0.1763, + "num_tokens": 3337518.0, + "reward": 6.012147903442383, + "reward_std": 2.5567662715911865, + "rewards/fitness_reward/mean": 5.7760162353515625, + "rewards/fitness_reward/std": 2.488330602645874, + "rewards/kidney_reward/mean": 0.29532673954963684, + "rewards/kidney_reward/std": 1.3476054668426514, + "rewards/length2tails_reward/mean": 0.8109618425369263, + "rewards/length2tails_reward/std": 0.2880936563014984, + "rewards/thermo_reward/mean": -0.22854575514793396, + "rewards/thermo_reward/std": 2.024935007095337, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 266.65625, + "completions/mean_terminated_length": 266.65625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.19405508507043123, + "epoch": 0.764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8079236149787903, + "learning_rate": 3.3607378734775837e-07, + "loss": -0.0455, + "num_tokens": 3346083.0, + "reward": 6.148028373718262, + "reward_std": 2.541191339492798, + "rewards/fitness_reward/mean": 5.762939453125, + "rewards/fitness_reward/std": 2.530473470687866, + "rewards/kidney_reward/mean": -0.24977697432041168, + "rewards/kidney_reward/std": 1.4287378787994385, + "rewards/length2tails_reward/mean": 0.8520157337188721, + "rewards/length2tails_reward/std": 0.20463883876800537, + "rewards/thermo_reward/mean": 0.5939469337463379, + "rewards/thermo_reward/std": 1.7890620231628418, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 265.0, + "completions/mean_terminated_length": 265.0, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.1499976934865117, + "epoch": 0.766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8777826428413391, + "learning_rate": 3.308693936411421e-07, + "loss": -0.1306, + "num_tokens": 3354595.0, + "reward": 6.628800392150879, + "reward_std": 2.6668899059295654, + "rewards/fitness_reward/mean": 6.241194725036621, + "rewards/fitness_reward/std": 1.9804837703704834, + "rewards/kidney_reward/mean": 0.6248332262039185, + "rewards/kidney_reward/std": 1.411794900894165, + "rewards/length2tails_reward/mean": 0.7870715856552124, + "rewards/length2tails_reward/std": 0.24466806650161743, + "rewards/thermo_reward/mean": -0.2431577444076538, + "rewards/thermo_reward/std": 1.7822825908660889, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 265.9375, + "completions/mean_terminated_length": 265.9375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.14082825370132923, + "epoch": 0.768, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2032965421676636, + "learning_rate": 3.256976124162769e-07, + "loss": -0.0621, + "num_tokens": 3363137.0, + "reward": 6.046353340148926, + "reward_std": 3.363234519958496, + "rewards/fitness_reward/mean": 5.761190414428711, + "rewards/fitness_reward/std": 2.893301486968994, + "rewards/kidney_reward/mean": -0.21231193840503693, + "rewards/kidney_reward/std": 1.2381178140640259, + "rewards/length2tails_reward/mean": 0.7668105363845825, + "rewards/length2tails_reward/std": 0.3085991144180298, + "rewards/thermo_reward/mean": 0.39923232793807983, + "rewards/thermo_reward/std": 1.8276801109313965, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.3125, + "completions/mean_terminated_length": 270.3125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.13472178112715483, + "epoch": 0.77, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.536989450454712, + "learning_rate": 3.205586957384837e-07, + "loss": -0.0119, + "num_tokens": 3371819.0, + "reward": 7.058485984802246, + "reward_std": 2.452873945236206, + "rewards/fitness_reward/mean": 6.2134294509887695, + "rewards/fitness_reward/std": 2.1375479698181152, + "rewards/kidney_reward/mean": 0.4976133704185486, + "rewards/kidney_reward/std": 1.3583341836929321, + "rewards/length2tails_reward/mean": 0.7949094176292419, + "rewards/length2tails_reward/std": 0.30168935656547546, + "rewards/thermo_reward/mean": 0.7950452566146851, + "rewards/thermo_reward/std": 1.6629666090011597, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 273.96875, + "completions/mean_terminated_length": 273.96875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.16182382125407457, + "epoch": 0.772, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.567378282546997, + "learning_rate": 3.154528940713113e-07, + "loss": 0.0299, + "num_tokens": 3380618.0, + "reward": 6.462051868438721, + "reward_std": 2.986990451812744, + "rewards/fitness_reward/mean": 5.870641708374023, + "rewards/fitness_reward/std": 2.8402304649353027, + "rewards/kidney_reward/mean": 0.1701788753271103, + "rewards/kidney_reward/std": 1.3231279850006104, + "rewards/length2tails_reward/mean": 0.7905040979385376, + "rewards/length2tails_reward/std": 0.29682648181915283, + "rewards/thermo_reward/mean": 0.6173891425132751, + "rewards/thermo_reward/std": 1.581768274307251, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 279.5625, + "completions/mean_terminated_length": 279.5625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.1641734791919589, + "epoch": 0.774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5476036071777344, + "learning_rate": 3.103804562643302e-07, + "loss": 0.0692, + "num_tokens": 3389596.0, + "reward": 5.573647499084473, + "reward_std": 4.065038204193115, + "rewards/fitness_reward/mean": 5.163810729980469, + "rewards/fitness_reward/std": 3.8489232063293457, + "rewards/kidney_reward/mean": -0.38959816098213196, + "rewards/kidney_reward/std": 1.1808894872665405, + "rewards/length2tails_reward/mean": 0.9047496318817139, + "rewards/length2tails_reward/std": 0.14699599146842957, + "rewards/thermo_reward/mean": 0.7568966150283813, + "rewards/thermo_reward/std": 1.7143446207046509, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12199778575450182, + "epoch": 0.776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44223248958587646, + "learning_rate": 3.0534162954100263e-07, + "loss": 0.0034, + "num_tokens": 3398291.0, + "reward": 5.6734089851379395, + "reward_std": 2.7685587406158447, + "rewards/fitness_reward/mean": 5.9041361808776855, + "rewards/fitness_reward/std": 2.3458354473114014, + "rewards/kidney_reward/mean": -0.28289884328842163, + "rewards/kidney_reward/std": 1.5721582174301147, + "rewards/length2tails_reward/mean": 0.7498540878295898, + "rewards/length2tails_reward/std": 0.33232712745666504, + "rewards/thermo_reward/mean": -0.5534823536872864, + "rewards/thermo_reward/std": 2.1388351917266846, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 285.09375, + "completions/mean_terminated_length": 285.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1719574062153697, + "epoch": 0.778, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5087924003601074, + "learning_rate": 3.0033665948663446e-07, + "loss": 0.0468, + "num_tokens": 3407446.0, + "reward": 5.829162120819092, + "reward_std": 2.9063830375671387, + "rewards/fitness_reward/mean": 5.848479270935059, + "rewards/fitness_reward/std": 2.5653743743896484, + "rewards/kidney_reward/mean": -0.4199415445327759, + "rewards/kidney_reward/std": 1.430213451385498, + "rewards/length2tails_reward/mean": 0.8508896827697754, + "rewards/length2tails_reward/std": 0.21593841910362244, + "rewards/thermo_reward/mean": -0.04413709044456482, + "rewards/thermo_reward/std": 2.0255866050720215, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.13086803443729877, + "epoch": 0.78, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5239703059196472, + "learning_rate": 2.9536579003640527e-07, + "loss": 0.014, + "num_tokens": 3416108.0, + "reward": 6.765872955322266, + "reward_std": 1.354516863822937, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.16182968020439148, + "rewards/kidney_reward/std": 1.341886281967163, + "rewards/length2tails_reward/mean": 0.7907084226608276, + "rewards/length2tails_reward/std": 0.25706955790519714, + "rewards/thermo_reward/mean": -0.20803500711917877, + "rewards/thermo_reward/std": 2.00783109664917, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 274.8125, + "completions/mean_terminated_length": 274.8125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.13017123751342297, + "epoch": 0.782, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.406864881515503, + "learning_rate": 2.904292634634793e-07, + "loss": 0.0429, + "num_tokens": 3424934.0, + "reward": 5.226607799530029, + "reward_std": 3.46649432182312, + "rewards/fitness_reward/mean": 5.334480285644531, + "rewards/fitness_reward/std": 3.353198289871216, + "rewards/kidney_reward/mean": -0.2561941146850586, + "rewards/kidney_reward/std": 1.300828456878662, + "rewards/length2tails_reward/mean": 0.7608662247657776, + "rewards/length2tails_reward/std": 0.3362342417240143, + "rewards/thermo_reward/mean": -0.3399842083454132, + "rewards/thermo_reward/std": 1.951754093170166, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13058719597756863, + "epoch": 0.784, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5075511932373047, + "learning_rate": 2.8552732036719684e-07, + "loss": 0.0045, + "num_tokens": 3433585.0, + "reward": 6.7212090492248535, + "reward_std": 1.4675315618515015, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.09525611996650696, + "rewards/kidney_reward/std": 1.6818774938583374, + "rewards/length2tails_reward/mean": 0.720190167427063, + "rewards/length2tails_reward/std": 0.320951372385025, + "rewards/thermo_reward/mean": -0.19552983343601227, + "rewards/thermo_reward/std": 1.9775919914245605, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 269.40625, + "completions/mean_terminated_length": 269.40625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12077728472650051, + "epoch": 0.786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.332504004240036, + "learning_rate": 2.8066019966134904e-07, + "loss": -0.0, + "num_tokens": 3442238.0, + "reward": 6.944338798522949, + "reward_std": 1.4273179769515991, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.1750543713569641, + "rewards/kidney_reward/std": 1.3548657894134521, + "rewards/length2tails_reward/mean": 0.7342120409011841, + "rewards/length2tails_reward/std": 0.28480300307273865, + "rewards/thermo_reward/mean": 0.369899183511734, + "rewards/thermo_reward/std": 1.6564077138900757, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 277.9375, + "completions/mean_terminated_length": 277.9375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.1527182124555111, + "epoch": 0.788, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.405423164367676, + "learning_rate": 2.758281385625327e-07, + "loss": 0.0836, + "num_tokens": 3451164.0, + "reward": 6.0525336265563965, + "reward_std": 2.963632583618164, + "rewards/fitness_reward/mean": 5.772695541381836, + "rewards/fitness_reward/std": 2.850486993789673, + "rewards/kidney_reward/mean": -0.15535932779312134, + "rewards/kidney_reward/std": 1.2924243211746216, + "rewards/length2tails_reward/mean": 0.7899340987205505, + "rewards/length2tails_reward/std": 0.22973912954330444, + "rewards/thermo_reward/mean": 0.3200690448284149, + "rewards/thermo_reward/std": 1.8255976438522339, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 258.84375, + "completions/mean_terminated_length": 258.84375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "entropy": 0.14717209991067648, + "epoch": 0.79, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2791216373443604, + "learning_rate": 2.7103137257858863e-07, + "loss": -0.1568, + "num_tokens": 3459479.0, + "reward": 6.065085411071777, + "reward_std": 3.130974769592285, + "rewards/fitness_reward/mean": 5.851991653442383, + "rewards/fitness_reward/std": 2.909266948699951, + "rewards/kidney_reward/mean": 0.005277007818222046, + "rewards/kidney_reward/std": 1.342024564743042, + "rewards/length2tails_reward/mean": 0.7929951548576355, + "rewards/length2tails_reward/std": 0.24119798839092255, + "rewards/thermo_reward/mean": 0.024412035942077637, + "rewards/thermo_reward/std": 1.9675092697143555, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 281.5, + "completions/mean_terminated_length": 281.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.136367273516953, + "epoch": 0.792, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6745145916938782, + "learning_rate": 2.662701354971235e-07, + "loss": -0.0171, + "num_tokens": 3468519.0, + "reward": 6.343623161315918, + "reward_std": 1.3072080612182617, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.27624326944351196, + "rewards/kidney_reward/std": 1.1385921239852905, + "rewards/length2tails_reward/mean": 0.847183883190155, + "rewards/length2tails_reward/std": 0.2894735038280487, + "rewards/thermo_reward/mean": -0.23074282705783844, + "rewards/thermo_reward/std": 2.1387784481048584, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 275.71875, + "completions/mean_terminated_length": 275.71875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.17113555409014225, + "epoch": 0.794, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1915013790130615, + "learning_rate": 2.615446593741161e-07, + "loss": 0.0208, + "num_tokens": 3477374.0, + "reward": 7.070910453796387, + "reward_std": 1.5382869243621826, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.2943302392959595, + "rewards/kidney_reward/std": 1.565941333770752, + "rewards/length2tails_reward/mean": 0.8764208555221558, + "rewards/length2tails_reward/std": 0.21900318562984467, + "rewards/thermo_reward/mean": 0.43266230821609497, + "rewards/thermo_reward/std": 1.9067713022232056, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 268.25, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.14445937052369118, + "epoch": 0.796, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3606030941009521, + "learning_rate": 2.568551745226056e-07, + "loss": -0.0315, + "num_tokens": 3485990.0, + "reward": 6.238191604614258, + "reward_std": 2.30902361869812, + "rewards/fitness_reward/mean": 6.231675148010254, + "rewards/fitness_reward/std": 2.0343332290649414, + "rewards/kidney_reward/mean": -0.2886021137237549, + "rewards/kidney_reward/std": 1.3605780601501465, + "rewards/length2tails_reward/mean": 0.777554452419281, + "rewards/length2tails_reward/std": 0.2704724073410034, + "rewards/thermo_reward/mean": -0.08714352548122406, + "rewards/thermo_reward/std": 1.8821381330490112, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14623204432427883, + "epoch": 0.798, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5222352743148804, + "learning_rate": 2.5220190950146827e-07, + "loss": 0.0007, + "num_tokens": 3494746.0, + "reward": 6.362916946411133, + "reward_std": 2.5356452465057373, + "rewards/fitness_reward/mean": 6.1964030265808105, + "rewards/fitness_reward/std": 1.7332172393798828, + "rewards/kidney_reward/mean": 0.039355117827653885, + "rewards/kidney_reward/std": 1.2564246654510498, + "rewards/length2tails_reward/mean": 0.846625804901123, + "rewards/length2tails_reward/std": 0.27086248993873596, + "rewards/thermo_reward/mean": -0.12964069843292236, + "rewards/thermo_reward/std": 2.138139486312866, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 270.03125, + "completions/mean_terminated_length": 270.03125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.1653392855077982, + "epoch": 0.8, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1060235500335693, + "learning_rate": 2.4758509110427573e-07, + "loss": 0.0005, + "num_tokens": 3503419.0, + "reward": 5.930359840393066, + "reward_std": 3.656620979309082, + "rewards/fitness_reward/mean": 5.391902446746826, + "rewards/fitness_reward/std": 3.479548931121826, + "rewards/kidney_reward/mean": 0.09498319774866104, + "rewards/kidney_reward/std": 1.3903887271881104, + "rewards/length2tails_reward/mean": 0.817430317401886, + "rewards/length2tails_reward/std": 0.23707036674022675, + "rewards/thermo_reward/mean": 0.5732157230377197, + "rewards/thermo_reward/std": 1.8132132291793823, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 275.90625, + "completions/mean_terminated_length": 275.90625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13901573978364468, + "epoch": 0.802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8850361704826355, + "learning_rate": 2.430049443482437e-07, + "loss": 0.0059, + "num_tokens": 3512280.0, + "reward": 6.054816722869873, + "reward_std": 2.607663154602051, + "rewards/fitness_reward/mean": 6.111670017242432, + "rewards/fitness_reward/std": 2.1906065940856934, + "rewards/kidney_reward/mean": -0.3588145971298218, + "rewards/kidney_reward/std": 1.2051351070404053, + "rewards/length2tails_reward/mean": 0.7842074632644653, + "rewards/length2tails_reward/std": 0.32192572951316833, + "rewards/thermo_reward/mean": -0.1469959169626236, + "rewards/thermo_reward/std": 2.0782878398895264, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12296220194548368, + "epoch": 0.804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.912596583366394, + "learning_rate": 2.384616924632634e-07, + "loss": 0.0053, + "num_tokens": 3520975.0, + "reward": 6.582754135131836, + "reward_std": 1.5355113744735718, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.06718573719263077, + "rewards/kidney_reward/std": 1.3254574537277222, + "rewards/length2tails_reward/mean": 0.7945810556411743, + "rewards/length2tails_reward/std": 0.2715069651603699, + "rewards/thermo_reward/mean": 0.06476283073425293, + "rewards/thermo_reward/std": 1.9420396089553833, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 279.1875, + "completions/mean_terminated_length": 279.1875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.14521227590739727, + "epoch": 0.806, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1336517333984375, + "learning_rate": 2.339555568810221e-07, + "loss": 0.0963, + "num_tokens": 3529941.0, + "reward": 5.5686492919921875, + "reward_std": 3.8480067253112793, + "rewards/fitness_reward/mean": 5.522449493408203, + "rewards/fitness_reward/std": 3.398782968521118, + "rewards/kidney_reward/mean": -0.2542445957660675, + "rewards/kidney_reward/std": 1.4892573356628418, + "rewards/length2tails_reward/mean": 0.8749173283576965, + "rewards/length2tails_reward/std": 0.15462607145309448, + "rewards/thermo_reward/mean": -0.09081444144248962, + "rewards/thermo_reward/std": 1.7850987911224365, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 264.40625, + "completions/mean_terminated_length": 264.40625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.1401141695678234, + "epoch": 0.808, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9227191209793091, + "learning_rate": 2.2948675722421085e-07, + "loss": -0.1072, + "num_tokens": 3538434.0, + "reward": 6.522500038146973, + "reward_std": 3.04559588432312, + "rewards/fitness_reward/mean": 5.933748245239258, + "rewards/fitness_reward/std": 2.5987942218780518, + "rewards/kidney_reward/mean": 0.2467244416475296, + "rewards/kidney_reward/std": 1.4387593269348145, + "rewards/length2tails_reward/mean": 0.7946808934211731, + "rewards/length2tails_reward/std": 0.28842639923095703, + "rewards/thermo_reward/mean": 0.5334376096725464, + "rewards/thermo_reward/std": 1.7454192638397217, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1552875665947795, + "epoch": 0.81, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8183427453041077, + "learning_rate": 2.2505551129582046e-07, + "loss": 0.0279, + "num_tokens": 3547206.0, + "reward": 6.446796417236328, + "reward_std": 3.105996608734131, + "rewards/fitness_reward/mean": 5.8662495613098145, + "rewards/fitness_reward/std": 2.4916253089904785, + "rewards/kidney_reward/mean": 0.27844473719596863, + "rewards/kidney_reward/std": 1.3675066232681274, + "rewards/length2tails_reward/mean": 0.8420401811599731, + "rewards/length2tails_reward/std": 0.23425650596618652, + "rewards/thermo_reward/mean": 0.46162861585617065, + "rewards/thermo_reward/std": 1.8574618101119995, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 276.5, + "completions/mean_terminated_length": 276.5, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13848723284900188, + "epoch": 0.812, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2599036693572998, + "learning_rate": 2.2066203506852564e-07, + "loss": 0.0547, + "num_tokens": 3556086.0, + "reward": 6.115935802459717, + "reward_std": 2.4688777923583984, + "rewards/fitness_reward/mean": 5.774024486541748, + "rewards/fitness_reward/std": 2.4868690967559814, + "rewards/kidney_reward/mean": 0.08403107523918152, + "rewards/kidney_reward/std": 1.442877173423767, + "rewards/length2tails_reward/mean": 0.7783209085464478, + "rewards/length2tails_reward/std": 0.3148162364959717, + "rewards/thermo_reward/mean": 0.21063126623630524, + "rewards/thermo_reward/std": 1.992857575416565, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 271.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1279955366626382, + "epoch": 0.814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7296998500823975, + "learning_rate": 2.1630654267416026e-07, + "loss": 0.0067, + "num_tokens": 3564817.0, + "reward": 5.637965202331543, + "reward_std": 3.0680017471313477, + "rewards/fitness_reward/mean": 5.852019309997559, + "rewards/fitness_reward/std": 2.9153757095336914, + "rewards/kidney_reward/mean": -0.3093605637550354, + "rewards/kidney_reward/std": 1.314058780670166, + "rewards/length2tails_reward/mean": 0.8313709497451782, + "rewards/length2tails_reward/std": 0.2533099949359894, + "rewards/thermo_reward/mean": -0.5344333648681641, + "rewards/thermo_reward/std": 2.188082695007324, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 266.8125, + "completions/mean_terminated_length": 266.8125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.14575813338160515, + "epoch": 0.816, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0772520303726196, + "learning_rate": 2.1198924639327808e-07, + "loss": -0.0441, + "num_tokens": 3573387.0, + "reward": 6.009997367858887, + "reward_std": 2.9660511016845703, + "rewards/fitness_reward/mean": 5.847411155700684, + "rewards/fitness_reward/std": 2.5736851692199707, + "rewards/kidney_reward/mean": -0.3084346354007721, + "rewards/kidney_reward/std": 1.4720526933670044, + "rewards/length2tails_reward/mean": 0.8826963901519775, + "rewards/length2tails_reward/std": 0.2083863615989685, + "rewards/thermo_reward/mean": 0.19225841760635376, + "rewards/thermo_reward/std": 1.89582359790802, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 267.78125, + "completions/mean_terminated_length": 267.78125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.1622565258294344, + "epoch": 0.818, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0969433784484863, + "learning_rate": 2.077103566448094e-07, + "loss": -0.0479, + "num_tokens": 3581988.0, + "reward": 6.01772403717041, + "reward_std": 3.2350380420684814, + "rewards/fitness_reward/mean": 5.869783401489258, + "rewards/fitness_reward/std": 2.8406243324279785, + "rewards/kidney_reward/mean": 0.10627858340740204, + "rewards/kidney_reward/std": 1.4346028566360474, + "rewards/length2tails_reward/mean": 0.8081914186477661, + "rewards/length2tails_reward/std": 0.26256439089775085, + "rewards/thermo_reward/mean": -0.21449324488639832, + "rewards/thermo_reward/std": 2.0515637397766113, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.12715268693864346, + "epoch": 0.82, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2996414303779602, + "learning_rate": 2.0347008197580372e-07, + "loss": 0.0048, + "num_tokens": 3590660.0, + "reward": 6.63413667678833, + "reward_std": 1.4439584016799927, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.29404324293136597, + "rewards/kidney_reward/std": 1.3528603315353394, + "rewards/length2tails_reward/mean": 0.74732905626297, + "rewards/length2tails_reward/std": 0.3145018517971039, + "rewards/thermo_reward/mean": -0.37605229020118713, + "rewards/thermo_reward/std": 2.162081241607666, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 278.21875, + "completions/mean_terminated_length": 278.21875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.19567361939698458, + "epoch": 0.822, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5846195220947266, + "learning_rate": 1.9926862905126663e-07, + "loss": 0.0862, + "num_tokens": 3599595.0, + "reward": 6.507219314575195, + "reward_std": 2.83516788482666, + "rewards/fitness_reward/mean": 5.774267196655273, + "rewards/fitness_reward/std": 2.844433546066284, + "rewards/kidney_reward/mean": 0.39035794138908386, + "rewards/kidney_reward/std": 1.258296012878418, + "rewards/length2tails_reward/mean": 0.7463816404342651, + "rewards/length2tails_reward/std": 0.27894169092178345, + "rewards/thermo_reward/mean": 0.702355146408081, + "rewards/thermo_reward/std": 1.6959781646728516, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 273.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13486727699637413, + "epoch": 0.824, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4812967777252197, + "learning_rate": 1.9510620264408594e-07, + "loss": 0.0737, + "num_tokens": 3608380.0, + "reward": 6.2867326736450195, + "reward_std": 2.4341838359832764, + "rewards/fitness_reward/mean": 6.129810333251953, + "rewards/fitness_reward/std": 2.0918588638305664, + "rewards/kidney_reward/mean": -0.017805874347686768, + "rewards/kidney_reward/std": 1.2995150089263916, + "rewards/length2tails_reward/mean": 0.6775492429733276, + "rewards/length2tails_reward/std": 0.3680454194545746, + "rewards/thermo_reward/mean": -0.007123976945877075, + "rewards/thermo_reward/std": 1.8101736307144165, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.12293026130646467, + "epoch": 0.826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8246999382972717, + "learning_rate": 1.9098300562505264e-07, + "loss": -0.0029, + "num_tokens": 3617104.0, + "reward": 6.208252906799316, + "reward_std": 1.9879117012023926, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": 0.3767698407173157, + "rewards/kidney_reward/std": 1.368463397026062, + "rewards/length2tails_reward/mean": 0.8074749708175659, + "rewards/length2tails_reward/std": 0.30307063460350037, + "rewards/thermo_reward/mean": -0.7226856350898743, + "rewards/thermo_reward/std": 2.1550796031951904, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 274.21875, + "completions/mean_terminated_length": 274.21875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14080187398940325, + "epoch": 0.828, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4719889461994171, + "learning_rate": 1.8689923895297244e-07, + "loss": 0.0025, + "num_tokens": 3625911.0, + "reward": 6.903785705566406, + "reward_std": 1.76779305934906, + "rewards/fitness_reward/mean": 6.2823309898376465, + "rewards/fitness_reward/std": 0.9759886264801025, + "rewards/kidney_reward/mean": 0.25635695457458496, + "rewards/kidney_reward/std": 1.4129506349563599, + "rewards/length2tails_reward/mean": 0.7412567734718323, + "rewards/length2tails_reward/std": 0.2820318043231964, + "rewards/thermo_reward/mean": 0.6159244775772095, + "rewards/thermo_reward/std": 1.7575407028198242, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14221271220594645, + "epoch": 0.83, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3563969135284424, + "learning_rate": 1.828551016648715e-07, + "loss": -0.0005, + "num_tokens": 3634626.0, + "reward": 6.651411056518555, + "reward_std": 2.018911838531494, + "rewards/fitness_reward/mean": 6.2043962478637695, + "rewards/fitness_reward/std": 1.6906987428665161, + "rewards/kidney_reward/mean": 0.12795323133468628, + "rewards/kidney_reward/std": 1.3059492111206055, + "rewards/length2tails_reward/mean": 0.8251470327377319, + "rewards/length2tails_reward/std": 0.25611862540245056, + "rewards/thermo_reward/mean": 0.3535034656524658, + "rewards/thermo_reward/std": 1.824897289276123, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12413936480879784, + "epoch": 0.832, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3519450426101685, + "learning_rate": 1.7885079086629596e-07, + "loss": -0.0, + "num_tokens": 3643309.0, + "reward": 6.099939346313477, + "reward_std": 2.4669342041015625, + "rewards/fitness_reward/mean": 6.108650207519531, + "rewards/fitness_reward/std": 2.2070798873901367, + "rewards/kidney_reward/mean": -0.1856500655412674, + "rewards/kidney_reward/std": 1.055111289024353, + "rewards/length2tails_reward/mean": 0.8407902717590332, + "rewards/length2tails_reward/std": 0.207689568400383, + "rewards/thermo_reward/mean": -0.2521669864654541, + "rewards/thermo_reward/std": 1.982387661933899, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13434513751417398, + "epoch": 0.834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5159134268760681, + "learning_rate": 1.7488650172170493e-07, + "loss": 0.0032, + "num_tokens": 3651993.0, + "reward": 6.955452919006348, + "reward_std": 1.0383720397949219, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.2530665397644043, + "rewards/kidney_reward/std": 1.161150336265564, + "rewards/length2tails_reward/mean": 0.7715202569961548, + "rewards/length2tails_reward/std": 0.2859238386154175, + "rewards/thermo_reward/mean": 0.5956157445907593, + "rewards/thermo_reward/std": 1.7789171934127808, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 269.21875, + "completions/mean_terminated_length": 269.21875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12625315971672535, + "epoch": 0.836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3186589479446411, + "learning_rate": 1.7096242744495838e-07, + "loss": -0.0, + "num_tokens": 3660640.0, + "reward": 6.766183853149414, + "reward_std": 1.2357929944992065, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.09876012057065964, + "rewards/kidney_reward/std": 1.3812872171401978, + "rewards/length2tails_reward/mean": 0.683132529258728, + "rewards/length2tails_reward/std": 0.3223157525062561, + "rewards/thermo_reward/mean": 0.10696518421173096, + "rewards/thermo_reward/std": 1.9921863079071045, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.1875, + "completions/mean_terminated_length": 270.1875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13841083087027073, + "epoch": 0.838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7905070185661316, + "learning_rate": 1.6707875928990056e-07, + "loss": 0.0008, + "num_tokens": 3669318.0, + "reward": 6.6396074295043945, + "reward_std": 1.6409484148025513, + "rewards/fitness_reward/mean": 6.316043376922607, + "rewards/fitness_reward/std": 1.5570749044418335, + "rewards/kidney_reward/mean": -0.39684462547302246, + "rewards/kidney_reward/std": 1.2976994514465332, + "rewards/length2tails_reward/mean": 0.8042482137680054, + "rewards/length2tails_reward/std": 0.1898868829011917, + "rewards/thermo_reward/mean": 0.6418485045433044, + "rewards/thermo_reward/std": 1.550958514213562, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 273.78125, + "completions/mean_terminated_length": 273.78125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14957595337182283, + "epoch": 0.84, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3791979551315308, + "learning_rate": 1.6323568654103837e-07, + "loss": 0.0222, + "num_tokens": 3678111.0, + "reward": 5.700117588043213, + "reward_std": 3.136606454849243, + "rewards/fitness_reward/mean": 5.612381935119629, + "rewards/fitness_reward/std": 3.1102912425994873, + "rewards/kidney_reward/mean": 0.09526537358760834, + "rewards/kidney_reward/std": 1.4866007566452026, + "rewards/length2tails_reward/mean": 0.8351494073867798, + "rewards/length2tails_reward/std": 0.23856548964977264, + "rewards/thermo_reward/mean": -0.3373691439628601, + "rewards/thermo_reward/std": 2.122860908508301, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 272.59375, + "completions/mean_terminated_length": 272.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13295627105981112, + "epoch": 0.842, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.530700445175171, + "learning_rate": 1.5943339650431574e-07, + "loss": 0.0591, + "num_tokens": 3686866.0, + "reward": 6.56540584564209, + "reward_std": 2.4985995292663574, + "rewards/fitness_reward/mean": 6.224242210388184, + "rewards/fitness_reward/std": 2.076380491256714, + "rewards/kidney_reward/mean": 0.10974864661693573, + "rewards/kidney_reward/std": 1.4650508165359497, + "rewards/length2tails_reward/mean": 0.7296964526176453, + "rewards/length2tails_reward/std": 0.291138231754303, + "rewards/thermo_reward/mean": 0.20773005485534668, + "rewards/thermo_reward/std": 1.9620627164840698, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.6875, + "completions/mean_terminated_length": 269.6875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.11998305935412645, + "epoch": 0.844, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5405923128128052, + "learning_rate": 1.5567207449798515e-07, + "loss": -0.0009, + "num_tokens": 3695528.0, + "reward": 6.607175827026367, + "reward_std": 1.4843957424163818, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.03256216645240784, + "rewards/kidney_reward/std": 1.2762181758880615, + "rewards/length2tails_reward/mean": 0.7198657393455505, + "rewards/length2tails_reward/std": 0.3216610252857208, + "rewards/thermo_reward/mean": -0.08963754773139954, + "rewards/thermo_reward/std": 2.1419501304626465, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 280.5, + "completions/mean_terminated_length": 280.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.17389845848083496, + "epoch": 0.846, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9274736642837524, + "learning_rate": 1.5195190384357404e-07, + "loss": 0.086, + "num_tokens": 3704536.0, + "reward": 5.957474231719971, + "reward_std": 2.9784088134765625, + "rewards/fitness_reward/mean": 5.606304168701172, + "rewards/fitness_reward/std": 3.129211187362671, + "rewards/kidney_reward/mean": -0.2096947282552719, + "rewards/kidney_reward/std": 1.2081297636032104, + "rewards/length2tails_reward/mean": 0.8495426774024963, + "rewards/length2tails_reward/std": 0.19200514256954193, + "rewards/thermo_reward/mean": 0.4872628450393677, + "rewards/thermo_reward/std": 1.6905598640441895, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 280.1875, + "completions/mean_terminated_length": 280.1875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.14840978104621172, + "epoch": 0.848, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.710753321647644, + "learning_rate": 1.4827306585695233e-07, + "loss": 0.0106, + "num_tokens": 3713534.0, + "reward": 6.249197006225586, + "reward_std": 2.3880364894866943, + "rewards/fitness_reward/mean": 6.113107681274414, + "rewards/fitness_reward/std": 2.1827657222747803, + "rewards/kidney_reward/mean": 0.1490403264760971, + "rewards/kidney_reward/std": 1.3688663244247437, + "rewards/length2tails_reward/mean": 0.7044593691825867, + "rewards/length2tails_reward/std": 0.3274818956851959, + "rewards/thermo_reward/mean": -0.2290917932987213, + "rewards/thermo_reward/std": 1.8039517402648926, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1391929117962718, + "epoch": 0.85, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7323298454284668, + "learning_rate": 1.446357398394934e-07, + "loss": 0.0019, + "num_tokens": 3722202.0, + "reward": 6.236641883850098, + "reward_std": 3.057166814804077, + "rewards/fitness_reward/mean": 5.855730056762695, + "rewards/fitness_reward/std": 2.538175344467163, + "rewards/kidney_reward/mean": -0.0438118651509285, + "rewards/kidney_reward/std": 1.222165584564209, + "rewards/length2tails_reward/mean": 0.7789819240570068, + "rewards/length2tails_reward/std": 0.28580957651138306, + "rewards/thermo_reward/mean": 0.41614454984664917, + "rewards/thermo_reward/std": 1.7913116216659546, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 268.46875, + "completions/mean_terminated_length": 268.46875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.15740068443119526, + "epoch": 0.852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7037556171417236, + "learning_rate": 1.4104010306933555e-07, + "loss": -0.0547, + "num_tokens": 3730825.0, + "reward": 6.620261192321777, + "reward_std": 2.26141357421875, + "rewards/fitness_reward/mean": 6.2323808670043945, + "rewards/fitness_reward/std": 2.030343770980835, + "rewards/kidney_reward/mean": -0.044921278953552246, + "rewards/kidney_reward/std": 1.3782581090927124, + "rewards/length2tails_reward/mean": 0.8388746380805969, + "rewards/length2tails_reward/std": 0.23595750331878662, + "rewards/thermo_reward/mean": 0.4012451767921448, + "rewards/thermo_reward/std": 1.9409761428833008, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 274.59375, + "completions/mean_terminated_length": 274.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15207398403435946, + "epoch": 0.854, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2341874837875366, + "learning_rate": 1.3748633079274253e-07, + "loss": -0.0011, + "num_tokens": 3739644.0, + "reward": 6.3321919441223145, + "reward_std": 2.150449275970459, + "rewards/fitness_reward/mean": 6.1863789558410645, + "rewards/fitness_reward/std": 1.7867270708084106, + "rewards/kidney_reward/mean": 0.05245739221572876, + "rewards/kidney_reward/std": 1.3610572814941406, + "rewards/length2tails_reward/mean": 0.8321821093559265, + "rewards/length2tails_reward/std": 0.24851974844932556, + "rewards/thermo_reward/mean": -0.17692288756370544, + "rewards/thermo_reward/std": 2.0968687534332275, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.15625, + "completions/mean_terminated_length": 270.15625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11885499861091375, + "epoch": 0.856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3401205241680145, + "learning_rate": 1.3397459621556128e-07, + "loss": -0.0018, + "num_tokens": 3748321.0, + "reward": 6.152368545532227, + "reward_std": 1.571397066116333, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": -0.011298850178718567, + "rewards/kidney_reward/std": 1.3151267766952515, + "rewards/length2tails_reward/mean": 0.7107293009757996, + "rewards/length2tails_reward/std": 0.36373162269592285, + "rewards/thermo_reward/mean": -0.3980119228363037, + "rewards/thermo_reward/std": 2.023512363433838, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 265.8125, + "completions/mean_terminated_length": 265.8125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.12992818094789982, + "epoch": 0.858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.096072793006897, + "learning_rate": 1.30505070494781e-07, + "loss": -0.0726, + "num_tokens": 3756859.0, + "reward": 5.837516784667969, + "reward_std": 2.693760633468628, + "rewards/fitness_reward/mean": 5.796808242797852, + "rewards/fitness_reward/std": 2.362955331802368, + "rewards/kidney_reward/mean": -0.5988284945487976, + "rewards/kidney_reward/std": 1.1490215063095093, + "rewards/length2tails_reward/mean": 0.7456122636795044, + "rewards/length2tails_reward/std": 0.315470427274704, + "rewards/thermo_reward/mean": 0.30743902921676636, + "rewards/thermo_reward/std": 1.7514585256576538, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 278.9375, + "completions/mean_terminated_length": 278.9375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1424407884478569, + "epoch": 0.86, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0920772552490234, + "learning_rate": 1.2707792273019047e-07, + "loss": 0.1266, + "num_tokens": 3765817.0, + "reward": 6.199623107910156, + "reward_std": 2.7672059535980225, + "rewards/fitness_reward/mean": 5.834479808807373, + "rewards/fitness_reward/std": 2.6360130310058594, + "rewards/kidney_reward/mean": 0.03671124577522278, + "rewards/kidney_reward/std": 1.3138879537582397, + "rewards/length2tails_reward/mean": 0.7895512580871582, + "rewards/length2tails_reward/std": 0.25905853509902954, + "rewards/thermo_reward/mean": 0.2987987995147705, + "rewards/thermo_reward/std": 1.9619476795196533, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13846470788121223, + "epoch": 0.862, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2143913507461548, + "learning_rate": 1.2369331995613663e-07, + "loss": -0.0064, + "num_tokens": 3774514.0, + "reward": 6.380249977111816, + "reward_std": 1.4921387434005737, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.5315516591072083, + "rewards/kidney_reward/std": 1.4306668043136597, + "rewards/length2tails_reward/mean": 0.7784464359283447, + "rewards/length2tails_reward/std": 0.31882283091545105, + "rewards/thermo_reward/mean": 0.13218817114830017, + "rewards/thermo_reward/std": 1.781838059425354, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.375, + "completions/mean_terminated_length": 270.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1336890598759055, + "epoch": 0.864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4450879693031311, + "learning_rate": 1.2035142713338363e-07, + "loss": -0.0037, + "num_tokens": 3783198.0, + "reward": 6.036025524139404, + "reward_std": 2.70249605178833, + "rewards/fitness_reward/mean": 5.733524322509766, + "rewards/fitness_reward/std": 2.6453893184661865, + "rewards/kidney_reward/mean": -0.06320416927337646, + "rewards/kidney_reward/std": 1.3764325380325317, + "rewards/length2tails_reward/mean": 0.6983875036239624, + "rewards/length2tails_reward/std": 0.3537623882293701, + "rewards/thermo_reward/mean": 0.31901222467422485, + "rewards/thermo_reward/std": 1.663163185119629, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 262.09375, + "completions/mean_terminated_length": 262.09375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.14609050378203392, + "epoch": 0.866, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9099003076553345, + "learning_rate": 1.1705240714107301e-07, + "loss": -0.0978, + "num_tokens": 3791617.0, + "reward": 6.100383758544922, + "reward_std": 2.9282727241516113, + "rewards/fitness_reward/mean": 5.8608198165893555, + "rewards/fitness_reward/std": 2.8744940757751465, + "rewards/kidney_reward/mean": 0.06438690423965454, + "rewards/kidney_reward/std": 1.2841964960098267, + "rewards/length2tails_reward/mean": 0.7604968547821045, + "rewards/length2tails_reward/std": 0.29300975799560547, + "rewards/thermo_reward/mean": 0.03449193388223648, + "rewards/thermo_reward/std": 2.0563175678253174, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 276.96875, + "completions/mean_terminated_length": 276.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14405822847038507, + "epoch": 0.868, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8847280740737915, + "learning_rate": 1.1379642076878526e-07, + "loss": 0.0476, + "num_tokens": 3800512.0, + "reward": 5.500676155090332, + "reward_std": 2.674482822418213, + "rewards/fitness_reward/mean": 5.6174211502075195, + "rewards/fitness_reward/std": 2.727896213531494, + "rewards/kidney_reward/mean": -0.15667006373405457, + "rewards/kidney_reward/std": 1.4270250797271729, + "rewards/length2tails_reward/mean": 0.7182776927947998, + "rewards/length2tails_reward/std": 0.3376780152320862, + "rewards/thermo_reward/mean": -0.43595871329307556, + "rewards/thermo_reward/std": 2.140556812286377, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 270.96875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.14862105157226324, + "epoch": 0.87, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.339766025543213, + "learning_rate": 1.1058362670870247e-07, + "loss": -0.0016, + "num_tokens": 3809215.0, + "reward": 5.193296432495117, + "reward_std": 3.8562912940979004, + "rewards/fitness_reward/mean": 5.236565589904785, + "rewards/fitness_reward/std": 3.654449701309204, + "rewards/kidney_reward/mean": -0.010497570037841797, + "rewards/kidney_reward/std": 1.438853144645691, + "rewards/length2tails_reward/mean": 0.7963101267814636, + "rewards/length2tails_reward/std": 0.26542168855667114, + "rewards/thermo_reward/mean": -0.4741969704627991, + "rewards/thermo_reward/std": 2.142672061920166, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 282.46875, + "completions/mean_terminated_length": 282.46875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.15248755365610123, + "epoch": 0.872, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.1925106048583984, + "learning_rate": 1.074141815478744e-07, + "loss": 0.1164, + "num_tokens": 3818286.0, + "reward": 5.293362140655518, + "reward_std": 3.658482551574707, + "rewards/fitness_reward/mean": 5.286756992340088, + "rewards/fitness_reward/std": 3.4996767044067383, + "rewards/kidney_reward/mean": -0.034492507576942444, + "rewards/kidney_reward/std": 1.3296836614608765, + "rewards/length2tails_reward/mean": 0.8726150989532471, + "rewards/length2tails_reward/std": 0.24276185035705566, + "rewards/thermo_reward/mean": -0.3886043429374695, + "rewards/thermo_reward/std": 1.8644394874572754, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.13518050219863653, + "epoch": 0.874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6139440536499023, + "learning_rate": 1.0428823976058709e-07, + "loss": -0.0233, + "num_tokens": 3826942.0, + "reward": 6.495221138000488, + "reward_std": 2.355567216873169, + "rewards/fitness_reward/mean": 6.006479263305664, + "rewards/fitness_reward/std": 2.2557687759399414, + "rewards/kidney_reward/mean": 0.07885207235813141, + "rewards/kidney_reward/std": 1.2913696765899658, + "rewards/length2tails_reward/mean": 0.7283536195755005, + "rewards/length2tails_reward/std": 0.3239452540874481, + "rewards/thermo_reward/mean": 0.5344558954238892, + "rewards/thermo_reward/std": 1.7547905445098877, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.11761743109673262, + "epoch": 0.876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33359822630882263, + "learning_rate": 1.0120595370083318e-07, + "loss": 0.0066, + "num_tokens": 3835650.0, + "reward": 6.277066230773926, + "reward_std": 1.4569121599197388, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.38881829380989075, + "rewards/kidney_reward/std": 1.234520673751831, + "rewards/length2tails_reward/mean": 0.8334996700286865, + "rewards/length2tails_reward/std": 0.22490544617176056, + "rewards/thermo_reward/mean": -0.45041751861572266, + "rewards/thermo_reward/std": 2.2308945655822754, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.14700100850313902, + "epoch": 0.878, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8419653177261353, + "learning_rate": 9.81674735948863e-08, + "loss": 0.0156, + "num_tokens": 3844410.0, + "reward": 6.325191020965576, + "reward_std": 2.3530921936035156, + "rewards/fitness_reward/mean": 6.140649795532227, + "rewards/fitness_reward/std": 2.0330381393432617, + "rewards/kidney_reward/mean": -0.03410997986793518, + "rewards/kidney_reward/std": 1.2392289638519287, + "rewards/length2tails_reward/mean": 0.8348275423049927, + "rewards/length2tails_reward/std": 0.2512573301792145, + "rewards/thermo_reward/mean": -0.014221221208572388, + "rewards/thermo_reward/std": 1.937234878540039, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 261.28125, + "completions/mean_terminated_length": 261.28125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.14893424790352583, + "epoch": 0.88, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.867492437362671, + "learning_rate": 9.517294753398064e-08, + "loss": -0.1308, + "num_tokens": 3852803.0, + "reward": 5.713663101196289, + "reward_std": 3.0692226886749268, + "rewards/fitness_reward/mean": 5.540594100952148, + "rewards/fitness_reward/std": 3.004016637802124, + "rewards/kidney_reward/mean": -0.04113885760307312, + "rewards/kidney_reward/std": 1.2972465753555298, + "rewards/length2tails_reward/mean": 0.8506132960319519, + "rewards/length2tails_reward/std": 0.2541244924068451, + "rewards/thermo_reward/mean": -0.038030415773391724, + "rewards/thermo_reward/std": 1.9679735898971558, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 282.1875, + "completions/mean_terminated_length": 282.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.154700574465096, + "epoch": 0.882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5670086741447449, + "learning_rate": 9.222252146709142e-08, + "loss": -0.0254, + "num_tokens": 3861865.0, + "reward": 6.069726943969727, + "reward_std": 2.668290138244629, + "rewards/fitness_reward/mean": 6.112736701965332, + "rewards/fitness_reward/std": 2.184788465499878, + "rewards/kidney_reward/mean": -0.46153780817985535, + "rewards/kidney_reward/std": 1.1795583963394165, + "rewards/length2tails_reward/mean": 0.8094608187675476, + "rewards/length2tails_reward/std": 0.25654107332229614, + "rewards/thermo_reward/mean": -0.029212698340415955, + "rewards/thermo_reward/std": 1.948235273361206, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 275.59375, + "completions/mean_terminated_length": 275.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1533412327989936, + "epoch": 0.884, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6889277696609497, + "learning_rate": 8.931633919382298e-08, + "loss": 0.0657, + "num_tokens": 3870716.0, + "reward": 6.111047267913818, + "reward_std": 2.321079730987549, + "rewards/fitness_reward/mean": 6.138444900512695, + "rewards/fitness_reward/std": 2.044992208480835, + "rewards/kidney_reward/mean": -0.30765989422798157, + "rewards/kidney_reward/std": 1.4663869142532349, + "rewards/length2tails_reward/mean": 0.8048216104507446, + "rewards/length2tails_reward/std": 0.2839038372039795, + "rewards/thermo_reward/mean": -0.1495458483695984, + "rewards/thermo_reward/std": 2.2613861560821533, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 284.5, + "completions/mean_terminated_length": 284.5, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14972092770040035, + "epoch": 0.886, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.146921634674072, + "learning_rate": 8.645454235739902e-08, + "loss": 0.1805, + "num_tokens": 3879852.0, + "reward": 6.23973274230957, + "reward_std": 3.0467097759246826, + "rewards/fitness_reward/mean": 5.754415035247803, + "rewards/fitness_reward/std": 2.9195473194122314, + "rewards/kidney_reward/mean": 0.18438826501369476, + "rewards/kidney_reward/std": 1.3486359119415283, + "rewards/length2tails_reward/mean": 0.715106725692749, + "rewards/length2tails_reward/std": 0.3136950433254242, + "rewards/thermo_reward/mean": 0.4286932945251465, + "rewards/thermo_reward/std": 1.8948018550872803, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.16300893109291792, + "epoch": 0.888, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.321374535560608, + "learning_rate": 8.363727043776036e-08, + "loss": -0.0074, + "num_tokens": 3888513.0, + "reward": 5.419971466064453, + "reward_std": 3.4583020210266113, + "rewards/fitness_reward/mean": 5.180810928344727, + "rewards/fitness_reward/std": 3.550640344619751, + "rewards/kidney_reward/mean": 0.14285969734191895, + "rewards/kidney_reward/std": 1.2835968732833862, + "rewards/length2tails_reward/mean": 0.7409695386886597, + "rewards/length2tails_reward/std": 0.32659363746643066, + "rewards/thermo_reward/mean": -0.03502354770898819, + "rewards/thermo_reward/std": 1.8435451984405518, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.4375, + "completions/mean_terminated_length": 270.4375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13684740848839283, + "epoch": 0.89, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5912813544273376, + "learning_rate": 8.086466074476562e-08, + "loss": 0.0048, + "num_tokens": 3897199.0, + "reward": 6.76379919052124, + "reward_std": 2.2146174907684326, + "rewards/fitness_reward/mean": 6.315851211547852, + "rewards/fitness_reward/std": 1.5581613779067993, + "rewards/kidney_reward/mean": 0.2498674839735031, + "rewards/kidney_reward/std": 1.1828155517578125, + "rewards/length2tails_reward/mean": 0.7916619181632996, + "rewards/length2tails_reward/std": 0.26749032735824585, + "rewards/thermo_reward/mean": 0.2501968443393707, + "rewards/thermo_reward/std": 1.9461218118667603, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 267.15625, + "completions/mean_terminated_length": 267.15625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.14623074512928724, + "epoch": 0.892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.829219102859497, + "learning_rate": 7.813684841149959e-08, + "loss": -0.0739, + "num_tokens": 3905780.0, + "reward": 6.439332008361816, + "reward_std": 2.433130979537964, + "rewards/fitness_reward/mean": 6.125979423522949, + "rewards/fitness_reward/std": 2.1126816272735596, + "rewards/kidney_reward/mean": -0.09445090591907501, + "rewards/kidney_reward/std": 1.271247148513794, + "rewards/length2tails_reward/mean": 0.8165769577026367, + "rewards/length2tails_reward/std": 0.28393563628196716, + "rewards/thermo_reward/mean": 0.3128669559955597, + "rewards/thermo_reward/std": 1.8542346954345703, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.15820543095469475, + "epoch": 0.894, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0217692852020264, + "learning_rate": 7.545396638768697e-08, + "loss": 0.016, + "num_tokens": 3914548.0, + "reward": 6.446044445037842, + "reward_std": 2.6220896244049072, + "rewards/fitness_reward/mean": 6.2497239112854, + "rewards/fitness_reward/std": 1.9322361946105957, + "rewards/kidney_reward/mean": -0.11043130606412888, + "rewards/kidney_reward/std": 1.3969236612319946, + "rewards/length2tails_reward/mean": 0.8614251017570496, + "rewards/length2tails_reward/std": 0.22737865149974823, + "rewards/thermo_reward/mean": 0.07236060500144958, + "rewards/thermo_reward/std": 2.22379732131958, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 263.5625, + "completions/mean_terminated_length": 263.5625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.1258983640000224, + "epoch": 0.896, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8359396457672119, + "learning_rate": 7.281614543321269e-08, + "loss": -0.0726, + "num_tokens": 3923014.0, + "reward": 6.619543552398682, + "reward_std": 1.9513977766036987, + "rewards/fitness_reward/mean": 6.315708637237549, + "rewards/fitness_reward/std": 1.5589702129364014, + "rewards/kidney_reward/mean": 0.054669540375471115, + "rewards/kidney_reward/std": 1.4212532043457031, + "rewards/length2tails_reward/mean": 0.7442126274108887, + "rewards/length2tails_reward/std": 0.2968263030052185, + "rewards/thermo_reward/mean": 0.18089357018470764, + "rewards/thermo_reward/std": 1.9039746522903442, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.15910894237458706, + "epoch": 0.898, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.005949020385742, + "learning_rate": 7.022351411174865e-08, + "loss": 0.0452, + "num_tokens": 3931794.0, + "reward": 6.604391098022461, + "reward_std": 2.1950721740722656, + "rewards/fitness_reward/mean": 6.22106409072876, + "rewards/fitness_reward/std": 2.0943596363067627, + "rewards/kidney_reward/mean": 0.1045638918876648, + "rewards/kidney_reward/std": 1.3927233219146729, + "rewards/length2tails_reward/mean": 0.8200255632400513, + "rewards/length2tails_reward/std": 0.20488189160823822, + "rewards/thermo_reward/mean": 0.25207698345184326, + "rewards/thermo_reward/std": 1.908453106880188, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.28125, + "completions/mean_terminated_length": 272.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.12655364908277988, + "epoch": 0.9, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5396273732185364, + "learning_rate": 6.767619878448783e-08, + "loss": 0.0036, + "num_tokens": 3940539.0, + "reward": 5.361003398895264, + "reward_std": 3.229443311691284, + "rewards/fitness_reward/mean": 5.443634986877441, + "rewards/fitness_reward/std": 3.0867321491241455, + "rewards/kidney_reward/mean": -0.2784079611301422, + "rewards/kidney_reward/std": 1.3394166231155396, + "rewards/length2tails_reward/mean": 0.8541355133056641, + "rewards/length2tails_reward/std": 0.22073553502559662, + "rewards/thermo_reward/mean": -0.3139229416847229, + "rewards/thermo_reward/std": 2.3062264919281006, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 276.1875, + "completions/mean_terminated_length": 276.1875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.1407141126692295, + "epoch": 0.902, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4678702354431152, + "learning_rate": 6.517432360398556e-08, + "loss": 0.0686, + "num_tokens": 3949409.0, + "reward": 6.1275634765625, + "reward_std": 2.8987269401550293, + "rewards/fitness_reward/mean": 5.816352367401123, + "rewards/fitness_reward/std": 2.6810781955718994, + "rewards/kidney_reward/mean": 0.1129336804151535, + "rewards/kidney_reward/std": 1.4291346073150635, + "rewards/length2tails_reward/mean": 0.7642084360122681, + "rewards/length2tails_reward/std": 0.3605248034000397, + "rewards/thermo_reward/mean": 0.1273842751979828, + "rewards/thermo_reward/std": 1.9623202085494995, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.1197723550722003, + "epoch": 0.904, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.2243266105651855, + "learning_rate": 6.271801050810855e-08, + "loss": -0.0072, + "num_tokens": 3958099.0, + "reward": 5.805145263671875, + "reward_std": 2.994102954864502, + "rewards/fitness_reward/mean": 5.523082733154297, + "rewards/fitness_reward/std": 2.7376046180725098, + "rewards/kidney_reward/mean": 0.11638177931308746, + "rewards/kidney_reward/std": 1.1877796649932861, + "rewards/length2tails_reward/mean": 0.7494679689407349, + "rewards/length2tails_reward/std": 0.3458668291568756, + "rewards/thermo_reward/mean": 0.07301057130098343, + "rewards/thermo_reward/std": 2.142275094985962, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 269.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1348592070862651, + "epoch": 0.906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5708919167518616, + "learning_rate": 6.030737921409168e-08, + "loss": -0.0035, + "num_tokens": 3966770.0, + "reward": 6.293191909790039, + "reward_std": 2.521247148513794, + "rewards/fitness_reward/mean": 5.784482002258301, + "rewards/fitness_reward/std": 2.4514005184173584, + "rewards/kidney_reward/mean": 0.3663383722305298, + "rewards/kidney_reward/std": 1.3017356395721436, + "rewards/length2tails_reward/mean": 0.7055273652076721, + "rewards/length2tails_reward/std": 0.33975639939308167, + "rewards/thermo_reward/mean": 0.29831749200820923, + "rewards/thermo_reward/std": 1.8889341354370117, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 273.09375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.1793599370867014, + "epoch": 0.908, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.875615358352661, + "learning_rate": 5.794254721270331e-08, + "loss": 0.0356, + "num_tokens": 3975541.0, + "reward": 5.444811820983887, + "reward_std": 3.4904708862304688, + "rewards/fitness_reward/mean": 5.315083980560303, + "rewards/fitness_reward/std": 3.412541627883911, + "rewards/kidney_reward/mean": -0.10441018640995026, + "rewards/kidney_reward/std": 1.3502939939498901, + "rewards/length2tails_reward/mean": 0.8567314147949219, + "rewards/length2tails_reward/std": 0.21395272016525269, + "rewards/thermo_reward/mean": -0.06450112909078598, + "rewards/thermo_reward/std": 2.0255086421966553, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 272.65625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.1371596548706293, + "epoch": 0.91, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32508590817451477, + "learning_rate": 5.5623629762519e-08, + "loss": 0.0036, + "num_tokens": 3984298.0, + "reward": 5.952658653259277, + "reward_std": 2.358738899230957, + "rewards/fitness_reward/mean": 5.95086145401001, + "rewards/fitness_reward/std": 2.06325626373291, + "rewards/kidney_reward/mean": -0.1831625998020172, + "rewards/kidney_reward/std": 1.493977427482605, + "rewards/length2tails_reward/mean": 0.8945037126541138, + "rewards/length2tails_reward/std": 0.19499589502811432, + "rewards/thermo_reward/mean": -0.2604953646659851, + "rewards/thermo_reward/std": 2.373155117034912, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 279.28125, + "completions/mean_terminated_length": 279.28125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.1629155706614256, + "epoch": 0.912, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5588083267211914, + "learning_rate": 5.335073988430372e-08, + "loss": 0.085, + "num_tokens": 3993267.0, + "reward": 6.2450714111328125, + "reward_std": 3.284686803817749, + "rewards/fitness_reward/mean": 5.599737167358398, + "rewards/fitness_reward/std": 3.141002655029297, + "rewards/kidney_reward/mean": -0.018673259764909744, + "rewards/kidney_reward/std": 1.2502866983413696, + "rewards/length2tails_reward/mean": 0.840927004814148, + "rewards/length2tails_reward/std": 0.2280232310295105, + "rewards/thermo_reward/mean": 0.8888781666755676, + "rewards/thermo_reward/std": 1.6391123533248901, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 271.8125, + "completions/mean_terminated_length": 271.8125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.16157941427081823, + "epoch": 0.914, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3679721355438232, + "learning_rate": 5.1123988355503465e-08, + "loss": -0.0221, + "num_tokens": 4001997.0, + "reward": 5.608067512512207, + "reward_std": 2.661470651626587, + "rewards/fitness_reward/mean": 5.589663505554199, + "rewards/fitness_reward/std": 2.8181302547454834, + "rewards/kidney_reward/mean": -0.5036675930023193, + "rewards/kidney_reward/std": 1.1448392868041992, + "rewards/length2tails_reward/mean": 0.848773181438446, + "rewards/length2tails_reward/std": 0.24732929468154907, + "rewards/thermo_reward/mean": 0.11608822643756866, + "rewards/thermo_reward/std": 2.154752731323242, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 276.3125, + "completions/mean_terminated_length": 276.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.15516785997897387, + "epoch": 0.916, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.663477659225464, + "learning_rate": 4.8943483704846465e-08, + "loss": 0.0742, + "num_tokens": 4010871.0, + "reward": 6.066410064697266, + "reward_std": 3.334627151489258, + "rewards/fitness_reward/mean": 5.877503395080566, + "rewards/fitness_reward/std": 2.8087968826293945, + "rewards/kidney_reward/mean": 0.10705171525478363, + "rewards/kidney_reward/std": 1.444384217262268, + "rewards/length2tails_reward/mean": 0.7966011762619019, + "rewards/length2tails_reward/std": 0.29769471287727356, + "rewards/thermo_reward/mean": -0.12753897905349731, + "rewards/thermo_reward/std": 1.8615918159484863, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 272.34375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.1699973875656724, + "epoch": 0.918, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2838830947875977, + "learning_rate": 4.6809332207053074e-08, + "loss": 0.0194, + "num_tokens": 4019618.0, + "reward": 6.185791015625, + "reward_std": 2.2091588973999023, + "rewards/fitness_reward/mean": 6.132357597351074, + "rewards/fitness_reward/std": 2.0780248641967773, + "rewards/kidney_reward/mean": -0.12923404574394226, + "rewards/kidney_reward/std": 1.3461557626724243, + "rewards/length2tails_reward/mean": 0.8275649547576904, + "rewards/length2tails_reward/std": 0.27842438220977783, + "rewards/thermo_reward/mean": -0.17768177390098572, + "rewards/thermo_reward/std": 1.9903466701507568, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 263.46875, + "completions/mean_terminated_length": 263.46875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.1407340606674552, + "epoch": 0.92, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4002125263214111, + "learning_rate": 4.472163787765637e-08, + "loss": -0.1189, + "num_tokens": 4028081.0, + "reward": 6.638701915740967, + "reward_std": 2.2502400875091553, + "rewards/fitness_reward/mean": 6.228815078735352, + "rewards/fitness_reward/std": 2.0505142211914062, + "rewards/kidney_reward/mean": 0.14867182075977325, + "rewards/kidney_reward/std": 1.1441434621810913, + "rewards/length2tails_reward/mean": 0.7031423449516296, + "rewards/length2tails_reward/std": 0.3281797170639038, + "rewards/thermo_reward/mean": 0.31953129172325134, + "rewards/thermo_reward/std": 1.8455839157104492, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14164727926254272, + "epoch": 0.922, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1973295211791992, + "learning_rate": 4.2680502467932754e-08, + "loss": 0.0036, + "num_tokens": 4036780.0, + "reward": 6.230764389038086, + "reward_std": 2.1518642902374268, + "rewards/fitness_reward/mean": 6.048417568206787, + "rewards/fitness_reward/std": 2.0361173152923584, + "rewards/kidney_reward/mean": -0.2080516517162323, + "rewards/kidney_reward/std": 1.3549437522888184, + "rewards/length2tails_reward/mean": 0.8079833984375, + "rewards/length2tails_reward/std": 0.26826247572898865, + "rewards/thermo_reward/mean": 0.16875341534614563, + "rewards/thermo_reward/std": 1.9367470741271973, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 277.03125, + "completions/mean_terminated_length": 277.03125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.16016131080687046, + "epoch": 0.924, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6919748783111572, + "learning_rate": 4.0686025459942486e-08, + "loss": 0.0673, + "num_tokens": 4045677.0, + "reward": 5.369567394256592, + "reward_std": 3.270564317703247, + "rewards/fitness_reward/mean": 5.362431526184082, + "rewards/fitness_reward/std": 2.935969114303589, + "rewards/kidney_reward/mean": -0.11173248291015625, + "rewards/kidney_reward/std": 1.241031527519226, + "rewards/length2tails_reward/mean": 0.8059428334236145, + "rewards/length2tails_reward/std": 0.319585919380188, + "rewards/thermo_reward/mean": -0.27696692943573, + "rewards/thermo_reward/std": 2.009323835372925, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.13632575143128633, + "epoch": 0.926, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3928329646587372, + "learning_rate": 3.87383040616811e-08, + "loss": 0.0068, + "num_tokens": 4054401.0, + "reward": 6.620509147644043, + "reward_std": 1.183544635772705, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.14931367337703705, + "rewards/kidney_reward/std": 1.3510866165161133, + "rewards/length2tails_reward/mean": 0.7820310592651367, + "rewards/length2tails_reward/std": 0.24996426701545715, + "rewards/thermo_reward/mean": 0.22867608070373535, + "rewards/thermo_reward/std": 1.7496261596679688, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13270280417054892, + "epoch": 0.928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5173652768135071, + "learning_rate": 3.6837433202341894e-08, + "loss": 0.001, + "num_tokens": 4063086.0, + "reward": 6.274802207946777, + "reward_std": 1.3407738208770752, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.46396902203559875, + "rewards/kidney_reward/std": 1.417636513710022, + "rewards/length2tails_reward/mean": 0.7583062052726746, + "rewards/length2tails_reward/std": 0.28290626406669617, + "rewards/thermo_reward/mean": -0.3421969711780548, + "rewards/thermo_reward/std": 1.9546772241592407, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1353480676189065, + "epoch": 0.93, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43471759557724, + "learning_rate": 3.4983505527688584e-08, + "loss": 0.0053, + "num_tokens": 4071795.0, + "reward": 6.438246250152588, + "reward_std": 1.6006256341934204, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.1561225801706314, + "rewards/kidney_reward/std": 1.3050709962844849, + "rewards/length2tails_reward/mean": 0.7958537340164185, + "rewards/length2tails_reward/std": 0.2570348083972931, + "rewards/thermo_reward/mean": -0.4481971561908722, + "rewards/thermo_reward/std": 1.9269288778305054, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 264.40625, + "completions/mean_terminated_length": 264.40625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.21487415861338377, + "epoch": 0.932, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.117380380630493, + "learning_rate": 3.317661139554062e-08, + "loss": -0.0721, + "num_tokens": 4080288.0, + "reward": 6.310624599456787, + "reward_std": 3.2768564224243164, + "rewards/fitness_reward/mean": 5.566933631896973, + "rewards/fitness_reward/std": 3.2391488552093506, + "rewards/kidney_reward/mean": 0.5489071011543274, + "rewards/kidney_reward/std": 1.3483208417892456, + "rewards/length2tails_reward/mean": 0.8428280353546143, + "rewards/length2tails_reward/std": 0.20550084114074707, + "rewards/thermo_reward/mean": 0.5170604586601257, + "rewards/thermo_reward/std": 1.59054434299469, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 280.9375, + "completions/mean_terminated_length": 280.9375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13101665768772364, + "epoch": 0.934, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.661982297897339, + "learning_rate": 3.141683887136892e-08, + "loss": 0.125, + "num_tokens": 4089310.0, + "reward": 5.751834869384766, + "reward_std": 3.421877145767212, + "rewards/fitness_reward/mean": 5.474588394165039, + "rewards/fitness_reward/std": 3.233222007751465, + "rewards/kidney_reward/mean": 0.019836775958538055, + "rewards/kidney_reward/std": 1.2497594356536865, + "rewards/length2tails_reward/mean": 0.6600824594497681, + "rewards/length2tails_reward/std": 0.35533779859542847, + "rewards/thermo_reward/mean": 0.20461499691009521, + "rewards/thermo_reward/std": 1.853097677230835, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 270.65625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13067772425711155, + "epoch": 0.936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5119224786758423, + "learning_rate": 2.9704273724003526e-08, + "loss": 0.0031, + "num_tokens": 4098003.0, + "reward": 6.646775245666504, + "reward_std": 1.3293075561523438, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.031656183302402496, + "rewards/kidney_reward/std": 1.3796350955963135, + "rewards/length2tails_reward/mean": 0.7634539604187012, + "rewards/length2tails_reward/std": 0.3263379633426666, + "rewards/thermo_reward/mean": -0.3024289608001709, + "rewards/thermo_reward/std": 2.1008477210998535, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13748420868068933, + "epoch": 0.938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.524431586265564, + "learning_rate": 2.8038999421453823e-08, + "loss": 0.0001, + "num_tokens": 4106742.0, + "reward": 6.620138168334961, + "reward_std": 1.317707896232605, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": -0.27108389139175415, + "rewards/kidney_reward/std": 1.3115421533584595, + "rewards/length2tails_reward/mean": 0.8660446405410767, + "rewards/length2tails_reward/std": 0.2033243179321289, + "rewards/thermo_reward/mean": -0.10425892472267151, + "rewards/thermo_reward/std": 2.0403130054473877, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.14136488363146782, + "epoch": 0.94, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.863619863986969, + "learning_rate": 2.642109712683971e-08, + "loss": 0.0127, + "num_tokens": 4115403.0, + "reward": 6.52547550201416, + "reward_std": 1.087640643119812, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.13498719036579132, + "rewards/kidney_reward/std": 1.1925514936447144, + "rewards/length2tails_reward/mean": 0.7910594940185547, + "rewards/length2tails_reward/std": 0.2733539044857025, + "rewards/thermo_reward/mean": -0.18620994687080383, + "rewards/thermo_reward/std": 1.8779382705688477, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 281.53125, + "completions/mean_terminated_length": 281.53125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14799009263515472, + "epoch": 0.942, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4865119457244873, + "learning_rate": 2.4850645694436734e-08, + "loss": 0.1213, + "num_tokens": 4124444.0, + "reward": 5.619724273681641, + "reward_std": 3.355104684829712, + "rewards/fitness_reward/mean": 5.491335868835449, + "rewards/fitness_reward/std": 3.193152666091919, + "rewards/kidney_reward/mean": 0.20664328336715698, + "rewards/kidney_reward/std": 1.4795762300491333, + "rewards/length2tails_reward/mean": 0.7287927865982056, + "rewards/length2tails_reward/std": 0.34719234704971313, + "rewards/thermo_reward/mean": -0.3142632246017456, + "rewards/thermo_reward/std": 1.987789273262024, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 276.125, + "completions/mean_terminated_length": 276.125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1640459280461073, + "epoch": 0.944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27881982922554016, + "learning_rate": 2.332772166583208e-08, + "loss": -0.0022, + "num_tokens": 4133312.0, + "reward": 6.9147844314575195, + "reward_std": 1.7263871431350708, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": 0.43178486824035645, + "rewards/kidney_reward/std": 1.449816107749939, + "rewards/length2tails_reward/mean": 0.8127265572547913, + "rewards/length2tails_reward/std": 0.30783113837242126, + "rewards/thermo_reward/mean": 0.014802634716033936, + "rewards/thermo_reward/std": 2.0539700984954834, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 754.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 285.46875, + "completions/mean_terminated_length": 270.3548278808594, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.16371324006468058, + "epoch": 0.946, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.666799545288086, + "learning_rate": 2.185239926619431e-08, + "loss": 0.2289, + "num_tokens": 4142479.0, + "reward": 6.779679298400879, + "reward_std": 2.5289268493652344, + "rewards/fitness_reward/mean": 6.130068778991699, + "rewards/fitness_reward/std": 2.0904550552368164, + "rewards/kidney_reward/mean": 0.32335078716278076, + "rewards/kidney_reward/std": 1.342889666557312, + "rewards/length2tails_reward/mean": 0.7477067112922668, + "rewards/length2tails_reward/std": 0.2941748797893524, + "rewards/thermo_reward/mean": 0.6020166277885437, + "rewards/thermo_reward/std": 1.8339732885360718, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.65625, + "completions/mean_terminated_length": 269.65625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12790144886821508, + "epoch": 0.948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6886346340179443, + "learning_rate": 2.0424750400655943e-08, + "loss": 0.0028, + "num_tokens": 4151140.0, + "reward": 6.2106170654296875, + "reward_std": 3.0731492042541504, + "rewards/fitness_reward/mean": 5.7802581787109375, + "rewards/fitness_reward/std": 2.463535785675049, + "rewards/kidney_reward/mean": -0.24863553047180176, + "rewards/kidney_reward/std": 1.4447722434997559, + "rewards/length2tails_reward/mean": 0.7356371283531189, + "rewards/length2tails_reward/std": 0.3002653121948242, + "rewards/thermo_reward/mean": 0.7415350079536438, + "rewards/thermo_reward/std": 1.7750520706176758, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 268.84375, + "completions/mean_terminated_length": 268.84375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12629235815256834, + "epoch": 0.95, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.221573829650879, + "learning_rate": 1.9044844650808467e-08, + "loss": -0.0024, + "num_tokens": 4159775.0, + "reward": 6.558210372924805, + "reward_std": 2.4178450107574463, + "rewards/fitness_reward/mean": 6.1215362548828125, + "rewards/fitness_reward/std": 2.136852741241455, + "rewards/kidney_reward/mean": 0.2523971498012543, + "rewards/kidney_reward/std": 1.3830926418304443, + "rewards/length2tails_reward/mean": 0.7020688056945801, + "rewards/length2tails_reward/std": 0.31335434317588806, + "rewards/thermo_reward/mean": 0.2699163258075714, + "rewards/thermo_reward/std": 1.4773310422897339, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.53125, + "completions/mean_terminated_length": 270.53125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.13433454185724258, + "epoch": 0.952, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0128010511398315, + "learning_rate": 1.771274927131139e-08, + "loss": 0.0026, + "num_tokens": 4168464.0, + "reward": 6.293290138244629, + "reward_std": 2.1775317192077637, + "rewards/fitness_reward/mean": 6.202960968017578, + "rewards/fitness_reward/std": 1.6983211040496826, + "rewards/kidney_reward/mean": -0.2153567522764206, + "rewards/kidney_reward/std": 1.2575360536575317, + "rewards/length2tails_reward/mean": 0.7736026048660278, + "rewards/length2tails_reward/std": 0.294264554977417, + "rewards/thermo_reward/mean": 0.00921345129609108, + "rewards/thermo_reward/std": 2.212282180786133, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.1272476715967059, + "epoch": 0.954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5237146019935608, + "learning_rate": 1.6428529186614193e-08, + "loss": -0.0039, + "num_tokens": 4177149.0, + "reward": 6.367445468902588, + "reward_std": 1.6114425659179688, + "rewards/fitness_reward/mean": 6.179342269897461, + "rewards/fitness_reward/std": 1.1073734760284424, + "rewards/kidney_reward/mean": 0.08609558641910553, + "rewards/kidney_reward/std": 1.3017667531967163, + "rewards/length2tails_reward/mean": 0.7121409177780151, + "rewards/length2tails_reward/std": 0.31732919812202454, + "rewards/thermo_reward/mean": -0.06595921516418457, + "rewards/thermo_reward/std": 1.9046858549118042, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.5625, + "completions/mean_terminated_length": 270.5625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12140200845897198, + "epoch": 0.956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39394357800483704, + "learning_rate": 1.519224698779198e-08, + "loss": 0.005, + "num_tokens": 4185839.0, + "reward": 6.108905792236328, + "reward_std": 2.085566759109497, + "rewards/fitness_reward/mean": 6.2113847732543945, + "rewards/fitness_reward/std": 1.6536391973495483, + "rewards/kidney_reward/mean": -0.22255373001098633, + "rewards/kidney_reward/std": 1.3791135549545288, + "rewards/length2tails_reward/mean": 0.8077136278152466, + "rewards/length2tails_reward/std": 0.24216946959495544, + "rewards/thermo_reward/mean": -0.3862607479095459, + "rewards/thermo_reward/std": 2.2672908306121826, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.1554853916168213, + "epoch": 0.958, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6892180442810059, + "learning_rate": 1.4003962929495127e-08, + "loss": -0.0541, + "num_tokens": 4194479.0, + "reward": 6.385907173156738, + "reward_std": 2.320415496826172, + "rewards/fitness_reward/mean": 6.11521577835083, + "rewards/fitness_reward/std": 2.1712775230407715, + "rewards/kidney_reward/mean": 0.04682411253452301, + "rewards/kidney_reward/std": 1.4674209356307983, + "rewards/length2tails_reward/mean": 0.8595852851867676, + "rewards/length2tails_reward/std": 0.2050134241580963, + "rewards/thermo_reward/mean": 0.06476667523384094, + "rewards/thermo_reward/std": 1.9430291652679443, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 284.5, + "completions/mean_terminated_length": 284.5, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.14631187915802002, + "epoch": 0.96, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1183857917785645, + "learning_rate": 1.2863734927012093e-08, + "loss": 0.1397, + "num_tokens": 4203615.0, + "reward": 5.392303943634033, + "reward_std": 3.975283622741699, + "rewards/fitness_reward/mean": 5.271598815917969, + "rewards/fitness_reward/std": 3.5448451042175293, + "rewards/kidney_reward/mean": -0.32605940103530884, + "rewards/kidney_reward/std": 1.1550776958465576, + "rewards/length2tails_reward/mean": 0.8411507606506348, + "rewards/length2tails_reward/std": 0.26199305057525635, + "rewards/thermo_reward/mean": 0.1468944400548935, + "rewards/thermo_reward/std": 1.9205420017242432, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 261.84375, + "completions/mean_terminated_length": 261.84375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.16815311275422573, + "epoch": 0.962, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7249035835266113, + "learning_rate": 1.1771618553447215e-08, + "loss": -0.1241, + "num_tokens": 4212026.0, + "reward": 6.322161674499512, + "reward_std": 3.3183627128601074, + "rewards/fitness_reward/mean": 5.872105121612549, + "rewards/fitness_reward/std": 2.830014228820801, + "rewards/kidney_reward/mean": 0.09821343421936035, + "rewards/kidney_reward/std": 1.3843051195144653, + "rewards/length2tails_reward/mean": 0.8109922409057617, + "rewards/length2tails_reward/std": 0.29384127259254456, + "rewards/thermo_reward/mean": 0.39640364050865173, + "rewards/thermo_reward/std": 1.6114957332611084, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 272.09375, + "completions/mean_terminated_length": 272.09375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12907341960817575, + "epoch": 0.964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.533166766166687, + "learning_rate": 1.0727667037011667e-08, + "loss": 0.0007, + "num_tokens": 4220765.0, + "reward": 6.261979579925537, + "reward_std": 1.1733015775680542, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.2737240493297577, + "rewards/kidney_reward/std": 1.2405914068222046, + "rewards/length2tails_reward/mean": 0.8256810307502747, + "rewards/length2tails_reward/std": 0.2820776402950287, + "rewards/thermo_reward/mean": -0.5917750597000122, + "rewards/thermo_reward/std": 1.9592158794403076, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.21401519421488047, + "epoch": 0.966, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6284871101379395, + "learning_rate": 9.731931258429638e-09, + "loss": -0.01, + "num_tokens": 4229477.0, + "reward": 6.693005561828613, + "reward_std": 2.3508384227752686, + "rewards/fitness_reward/mean": 6.221892356872559, + "rewards/fitness_reward/std": 2.0896761417388916, + "rewards/kidney_reward/mean": 0.1586083471775055, + "rewards/kidney_reward/std": 1.414463996887207, + "rewards/length2tails_reward/mean": 0.8126205205917358, + "rewards/length2tails_reward/std": 0.2758491039276123, + "rewards/thermo_reward/mean": 0.3773079514503479, + "rewards/thermo_reward/std": 2.071603298187256, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 282.96875, + "completions/mean_terminated_length": 282.96875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.17435009684413671, + "epoch": 0.968, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.2569141387939453, + "learning_rate": 8.784459748458317e-09, + "loss": 0.0154, + "num_tokens": 4238564.0, + "reward": 5.633315086364746, + "reward_std": 3.1612935066223145, + "rewards/fitness_reward/mean": 5.347982883453369, + "rewards/fitness_reward/std": 3.3307552337646484, + "rewards/kidney_reward/mean": -0.03157354146242142, + "rewards/kidney_reward/std": 1.3760062456130981, + "rewards/length2tails_reward/mean": 0.7512047290802002, + "rewards/length2tails_reward/std": 0.30314382910728455, + "rewards/thermo_reward/mean": 0.22663527727127075, + "rewards/thermo_reward/std": 1.7661378383636475, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 279.15625, + "completions/mean_terminated_length": 279.15625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.17211697157472372, + "epoch": 0.97, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.859675884246826, + "learning_rate": 7.885298685522235e-09, + "loss": 0.1062, + "num_tokens": 4247529.0, + "reward": 6.191662788391113, + "reward_std": 3.2506253719329834, + "rewards/fitness_reward/mean": 5.795844078063965, + "rewards/fitness_reward/std": 2.7598862648010254, + "rewards/kidney_reward/mean": 0.04356342554092407, + "rewards/kidney_reward/std": 1.4145551919937134, + "rewards/length2tails_reward/mean": 0.8112553954124451, + "rewards/length2tails_reward/std": 0.27985844016075134, + "rewards/thermo_reward/mean": 0.3424462676048279, + "rewards/thermo_reward/std": 1.8757723569869995, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.34375, + "completions/mean_terminated_length": 271.34375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14102749805897474, + "epoch": 0.972, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0287251472473145, + "learning_rate": 7.034491893463057e-09, + "loss": 0.0043, + "num_tokens": 4256244.0, + "reward": 6.852804183959961, + "reward_std": 1.6488749980926514, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": 0.1278020739555359, + "rewards/kidney_reward/std": 1.3883448839187622, + "rewards/length2tails_reward/mean": 0.7885364890098572, + "rewards/length2tails_reward/std": 0.3108876943588257, + "rewards/thermo_reward/mean": 0.41289806365966797, + "rewards/thermo_reward/std": 1.8513990640640259, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.14220308419317007, + "epoch": 0.974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5835313200950623, + "learning_rate": 6.23208083940363e-09, + "loss": 0.0032, + "num_tokens": 4264972.0, + "reward": 6.557872772216797, + "reward_std": 1.528399109840393, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.31340882182121277, + "rewards/kidney_reward/std": 1.2353618144989014, + "rewards/length2tails_reward/mean": 0.8489131331443787, + "rewards/length2tails_reward/std": 0.19918689131736755, + "rewards/thermo_reward/mean": 0.028078734874725342, + "rewards/thermo_reward/std": 1.8716614246368408, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 271.15625, + "completions/mean_terminated_length": 271.15625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.13235425390303135, + "epoch": 0.976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6584662795066833, + "learning_rate": 5.47810463172671e-09, + "loss": 0.0031, + "num_tokens": 4273681.0, + "reward": 6.674538612365723, + "reward_std": 1.3704478740692139, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.00790829211473465, + "rewards/kidney_reward/std": 1.332300066947937, + "rewards/length2tails_reward/mean": 0.802370548248291, + "rewards/length2tails_reward/std": 0.25463488698005676, + "rewards/thermo_reward/mean": -0.2426123023033142, + "rewards/thermo_reward/std": 2.2439165115356445, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.12326779402792454, + "epoch": 0.978, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37450870871543884, + "learning_rate": 4.772600018168815e-09, + "loss": 0.0022, + "num_tokens": 4282389.0, + "reward": 6.011283874511719, + "reward_std": 1.5877715349197388, + "rewards/fitness_reward/mean": 6.38532018661499, + "rewards/fitness_reward/std": 0.8105144500732422, + "rewards/kidney_reward/mean": -0.4957900643348694, + "rewards/kidney_reward/std": 1.1033035516738892, + "rewards/length2tails_reward/mean": 0.7796015739440918, + "rewards/length2tails_reward/std": 0.30093348026275635, + "rewards/thermo_reward/mean": -0.6420827507972717, + "rewards/thermo_reward/std": 2.05932354927063, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.1382999373599887, + "epoch": 0.98, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1265020370483398, + "learning_rate": 4.115601384029666e-09, + "loss": -0.0023, + "num_tokens": 4291113.0, + "reward": 7.089540958404541, + "reward_std": 1.1871302127838135, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.15937963128089905, + "rewards/kidney_reward/std": 1.3904200792312622, + "rewards/length2tails_reward/mean": 0.8089946508407593, + "rewards/length2tails_reward/std": 0.25792431831359863, + "rewards/thermo_reward/mean": 0.43260854482650757, + "rewards/thermo_reward/std": 1.7895396947860718, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 277.03125, + "completions/mean_terminated_length": 277.03125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.13852917868644, + "epoch": 0.982, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1202490329742432, + "learning_rate": 3.5071407504956294e-09, + "loss": 0.078, + "num_tokens": 4300010.0, + "reward": 6.318650722503662, + "reward_std": 1.5724629163742065, + "rewards/fitness_reward/mean": 6.488308906555176, + "rewards/fitness_reward/std": 0.5825939178466797, + "rewards/kidney_reward/mean": -0.29085487127304077, + "rewards/kidney_reward/std": 1.2370598316192627, + "rewards/length2tails_reward/mean": 0.7769042253494263, + "rewards/length2tails_reward/std": 0.30063578486442566, + "rewards/thermo_reward/mean": -0.4369143545627594, + "rewards/thermo_reward/std": 2.1379871368408203, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 270.78125, + "completions/mean_terminated_length": 270.78125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14235816057771444, + "epoch": 0.984, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0515778064727783, + "learning_rate": 2.947247773079753e-09, + "loss": 0.0048, + "num_tokens": 4308707.0, + "reward": 6.565581798553467, + "reward_std": 2.419142007827759, + "rewards/fitness_reward/mean": 6.200839042663574, + "rewards/fitness_reward/std": 2.2087697982788086, + "rewards/kidney_reward/mean": -0.203052818775177, + "rewards/kidney_reward/std": 1.2326304912567139, + "rewards/length2tails_reward/mean": 0.8171231150627136, + "rewards/length2tails_reward/std": 0.23487749695777893, + "rewards/thermo_reward/mean": 0.5239765048027039, + "rewards/thermo_reward/std": 1.8486820459365845, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.12068606168031693, + "epoch": 0.986, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6563858389854431, + "learning_rate": 2.435949740175802e-09, + "loss": -0.0026, + "num_tokens": 4317372.0, + "reward": 6.417881011962891, + "reward_std": 2.5073564052581787, + "rewards/fitness_reward/mean": 6.17474365234375, + "rewards/fitness_reward/std": 1.8490768671035767, + "rewards/kidney_reward/mean": -0.0659375786781311, + "rewards/kidney_reward/std": 1.3378238677978516, + "rewards/length2tails_reward/mean": 0.7271683216094971, + "rewards/length2tails_reward/std": 0.32772964239120483, + "rewards/thermo_reward/mean": 0.1886269599199295, + "rewards/thermo_reward/std": 2.022970199584961, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 286.78125, + "completions/mean_terminated_length": 286.78125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.18350693583488464, + "epoch": 0.988, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.7464146614074707, + "learning_rate": 1.973271571728441e-09, + "loss": 0.0335, + "num_tokens": 4326581.0, + "reward": 6.034966468811035, + "reward_std": 2.6427571773529053, + "rewards/fitness_reward/mean": 5.688358783721924, + "rewards/fitness_reward/std": 2.8081235885620117, + "rewards/kidney_reward/mean": 0.0014158524572849274, + "rewards/kidney_reward/std": 1.2659722566604614, + "rewards/length2tails_reward/mean": 0.8527138829231262, + "rewards/length2tails_reward/std": 0.21939502656459808, + "rewards/thermo_reward/mean": 0.26544201374053955, + "rewards/thermo_reward/std": 1.916554570198059, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 268.96875, + "completions/mean_terminated_length": 268.96875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.13475322257727385, + "epoch": 0.99, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8051790595054626, + "learning_rate": 1.559235818018978e-09, + "loss": 0.0115, + "num_tokens": 4335220.0, + "reward": 5.8860626220703125, + "reward_std": 3.1279499530792236, + "rewards/fitness_reward/mean": 5.766759872436523, + "rewards/fitness_reward/std": 2.515184164047241, + "rewards/kidney_reward/mean": 0.12276924401521683, + "rewards/kidney_reward/std": 1.2952481508255005, + "rewards/length2tails_reward/mean": 0.7117717266082764, + "rewards/length2tails_reward/std": 0.3357178866863251, + "rewards/thermo_reward/mean": -0.24005013704299927, + "rewards/thermo_reward/std": 2.0688977241516113, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 285.3125, + "completions/mean_terminated_length": 285.3125, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.18036719225347042, + "epoch": 0.992, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.536488056182861, + "learning_rate": 1.193862658566025e-09, + "loss": 0.1906, + "num_tokens": 4344382.0, + "reward": 6.254662990570068, + "reward_std": 2.8364017009735107, + "rewards/fitness_reward/mean": 5.82073974609375, + "rewards/fitness_reward/std": 2.669975996017456, + "rewards/kidney_reward/mean": 0.12117721140384674, + "rewards/kidney_reward/std": 1.3783304691314697, + "rewards/length2tails_reward/mean": 0.8040266633033752, + "rewards/length2tails_reward/std": 0.24874736368656158, + "rewards/thermo_reward/mean": 0.34465593099594116, + "rewards/thermo_reward/std": 1.8117518424987793, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 265.4375, + "completions/mean_terminated_length": 265.4375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.14075111038982868, + "epoch": 0.994, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8132911920547485, + "learning_rate": 8.771699011416167e-10, + "loss": -0.0563, + "num_tokens": 4352908.0, + "reward": 5.731771469116211, + "reward_std": 2.852626085281372, + "rewards/fitness_reward/mean": 5.548877239227295, + "rewards/fitness_reward/std": 2.9979569911956787, + "rewards/kidney_reward/mean": -0.32727035880088806, + "rewards/kidney_reward/std": 1.3308184146881104, + "rewards/length2tails_reward/mean": 0.8396377563476562, + "rewards/length2tails_reward/std": 0.1741836816072464, + "rewards/thermo_reward/mean": 0.27323901653289795, + "rewards/thermo_reward/std": 1.6997170448303223, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 265.9375, + "completions/mean_terminated_length": 265.9375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.13753656949847937, + "epoch": 0.996, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.766315758228302, + "learning_rate": 6.091729809042379e-10, + "loss": -0.0831, + "num_tokens": 4361450.0, + "reward": 5.796799182891846, + "reward_std": 3.151379108428955, + "rewards/fitness_reward/mean": 5.714599609375, + "rewards/fitness_reward/std": 2.720996141433716, + "rewards/kidney_reward/mean": -0.09053881466388702, + "rewards/kidney_reward/std": 1.1235448122024536, + "rewards/length2tails_reward/mean": 0.7273579835891724, + "rewards/length2tails_reward/std": 0.32541486620903015, + "rewards/thermo_reward/mean": -0.10874120146036148, + "rewards/thermo_reward/std": 1.9039497375488281, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 271.4375, + "completions/mean_terminated_length": 271.4375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.1267131119966507, + "epoch": 0.998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5692701935768127, + "learning_rate": 3.8988495964564774e-10, + "loss": 0.0018, + "num_tokens": 4370168.0, + "reward": 6.738133430480957, + "reward_std": 1.3152570724487305, + "rewards/fitness_reward/mean": 6.5912981033325195, + "rewards/fitness_reward/std": 0.0, + "rewards/kidney_reward/mean": 0.08398180454969406, + "rewards/kidney_reward/std": 1.4386367797851562, + "rewards/length2tails_reward/mean": 0.7613844871520996, + "rewards/length2tails_reward/std": 0.31253892183303833, + "rewards/thermo_reward/mean": -0.1710038036108017, + "rewards/thermo_reward/std": 1.9834370613098145, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 271.53125, + "completions/mean_terminated_length": 271.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.14032446220517159, + "epoch": 1.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2252322435379028, + "learning_rate": 2.1931652515450038e-10, + "loss": 0.0056, + "num_tokens": 4378889.0, + "reward": 6.141080379486084, + "reward_std": 3.0671908855438232, + "rewards/fitness_reward/mean": 5.957330703735352, + "rewards/fitness_reward/std": 2.511507272720337, + "rewards/kidney_reward/mean": 0.047492511570453644, + "rewards/kidney_reward/std": 1.287542462348938, + "rewards/length2tails_reward/mean": 0.7975193858146667, + "rewards/length2tails_reward/std": 0.21932071447372437, + "rewards/thermo_reward/mean": -0.07875257730484009, + "rewards/thermo_reward/std": 1.930476427078247, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 4378889, + "num_train_epochs": 1, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}